In [2]:
# Core Python Utilities
import itertools
import warnings
# Data Handling
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
# Sklearn: Models, Evaluation, and Preprocessing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# PyTorch: Deep Learning Framework
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
# Ignore warnings
warnings.filterwarnings("ignore")
# Enable inline plotting (for Jupyter notebooks only)
%matplotlib inline
In [4]:
# Load datasets
data1 = pd.read_csv('/Users/janat/Desktop/ppimic50pred/dataset/ALL_PPIS_ic50_results.csv')
data2 = pd.read_csv('/Users/janat/Desktop/ppimic50pred/dataset/ChEMBL_PPIs_new_ic50_results.csv', sep="\t")
# Remove overlapping ChEMBL IDs
common_ids = set(data1['chembl_id']).intersection(data2['chembl_id'])
data1_unique = data1[~data1['chembl_id'].isin(common_ids)]
data2_unique = data2[~data2['chembl_id'].isin(common_ids)]
# Combine and clean
combined_data = pd.concat([data1_unique, data2_unique], ignore_index=True)
combined_data = combined_data[
combined_data['ic50_value'].notna() &
(combined_data['ic50_value'].astype(str).str.strip() != "")
]
# Convert to numeric and remove invalid IC50s
combined_data['ic50_value'] = pd.to_numeric(combined_data['ic50_value'], errors='coerce')
combined_data = combined_data[combined_data['ic50_value'] > 0]
# Compute log(IC50)
combined_data['log_ic50'] = np.log10(combined_data['ic50_value'])
combined_data
Out[4]:
| chembl_id | ic50_value | units | pchembl_value | target_chembl_id | target_name | assay_chembl_id | activity_comment | log_ic50 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | CHEMBL7976 | 1410.0 | nM | 5.85 | CHEMBL2039 | Monoamine oxidase B | CHEMBL1003729 | NaN | 3.149219 |
| 1 | CHEMBL1094278 | 300.0 | nM | 6.52 | CHEMBL614361 | T47D | CHEMBL1109973 | NaN | 2.477121 |
| 2 | CHEMBL1097571 | 230.0 | nM | 6.64 | CHEMBL614361 | T47D | CHEMBL1109973 | NaN | 2.361728 |
| 3 | CHEMBL121516 | 30.0 | nM | 7.52 | CHEMBL387 | MCF7 | CHEMBL1109970 | NaN | 1.477121 |
| 4 | CHEMBL1236726 | 190.0 | nM | 6.72 | CHEMBL5023 | p53-binding protein Mdm-2 | CHEMBL3391053 | NaN | 2.278754 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3449 | CHEMBL1200969 | 4.0 | nM | 8.40 | CHEMBL1787 | Steroid 5-alpha-reductase 1 | CHEMBL4371136 | NaN | 0.602060 |
| 3450 | CHEMBL2311933 | 100.0 | nM | 7.00 | CHEMBL3885518 | Aryl hydrocarbon receptor nuclear translocator... | CHEMBL2319602 | NaN | 2.000000 |
| 3451 | CHEMBL2311967 | 90.0 | nM | 7.05 | CHEMBL3885518 | Aryl hydrocarbon receptor nuclear translocator... | CHEMBL2319602 | NaN | 1.954243 |
| 3452 | CHEMBL3235901 | 1400.0 | nM | 5.85 | CHEMBL3414417 | Histone acetyltransferase RTT109 | CHEMBL3419139 | NaN | 3.146128 |
| 3453 | CHEMBL4446667 | 770.0 | nM | 6.11 | CHEMBL4105786 | B-cell lymphoma 6 protein | CHEMBL4380816 | NaN | 2.886491 |
3454 rows × 9 columns
RDKit Descriptor¶
In [7]:
# Load RDKit descriptor data
rdkit_data = pd.read_csv('/Users/janat/Desktop/ppimic50pred/dataset/chembl_rdkit_descriptors.csv')
# Merge with IC50 data
merged_df_rdkit = pd.merge(
rdkit_data,
combined_data[['chembl_id', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name']],
on='chembl_id',
how='inner'
)
merged_df_rdkit
Out[7]:
| MaxAbsEStateIndex | MaxEStateIndex | MinAbsEStateIndex | MinEStateIndex | qed | SPS | MolWt | HeavyAtomMolWt | ExactMolWt | NumValenceElectrons | ... | fr_thiocyan | fr_thiophene | fr_unbrch_alkane | fr_urea | chembl_id | smiles | ic50_value | log_ic50 | target_chembl_id | target_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11.731188 | 11.731188 | 0.031901 | 0.031901 | 0.556230 | 10.500000 | 208.260 | 196.164 | 208.088815 | 78 | ... | 0 | 0 | 0 | 0 | CHEMBL7976 | O=C(/C=C/c1ccccc1)c1ccccc1 | 1410.0 | 3.149219 | CHEMBL2039 | Monoamine oxidase B |
| 1 | 8.541096 | 8.541096 | 0.220764 | 0.220764 | 0.437912 | 9.916667 | 225.404 | 210.284 | 225.031577 | 72 | ... | 0 | 0 | 0 | 0 | CHEMBL1094278 | CCN(CC)C(=S)SSCCO | 300.0 | 2.477121 | CHEMBL614361 | T47D |
| 2 | 5.342645 | 5.342645 | 1.026516 | 1.026516 | 0.358832 | 10.333333 | 265.513 | 242.329 | 265.099263 | 90 | ... | 0 | 0 | 3 | 0 | CHEMBL1097571 | CCCCCCSSC(=S)N(CC)CC | 230.0 | 2.361728 | CHEMBL614361 | T47D |
| 3 | 5.483680 | 5.483680 | 1.013018 | 1.013018 | 0.489495 | 20.888889 | 320.574 | 300.414 | 320.050933 | 102 | ... | 0 | 0 | 0 | 0 | CHEMBL121516 | S=C(SSC(=S)N1CCCCC1)N1CCCCC1 | 30.0 | 1.477121 | CHEMBL387 | MCF7 |
| 4 | 14.436125 | 14.436125 | 0.004844 | -0.004844 | 0.178353 | 15.250000 | 629.636 | 591.332 | 628.248415 | 228 | ... | 0 | 0 | 0 | 0 | CHEMBL1236726 | CN(C)CCCN(C)[C@H]1CCN(C(=O)c2[nH]c3cc(Cl)ccc3c... | 190.0 | 2.278754 | CHEMBL5023 | p53-binding protein Mdm-2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3446 | 13.565688 | 13.565688 | 0.053522 | -4.912768 | 0.426542 | 37.297297 | 528.537 | 498.297 | 528.221148 | 202 | ... | 0 | 0 | 0 | 0 | CHEMBL1200969 | C[C@]12C=CC(=O)N[C@@H]1CC[C@@H]1[C@@H]2CC[C@]2... | 4.0 | 0.602060 | CHEMBL1787 | Steroid 5-alpha-reductase 1 |
| 3447 | 13.297583 | 13.297583 | 0.006944 | -0.615307 | 0.585759 | 10.761905 | 308.656 | 302.608 | 308.011246 | 106 | ... | 0 | 0 | 0 | 0 | CHEMBL2311933 | O=[N+]([O-])c1c(Nc2cc(F)cc(Cl)c2)ccc2nonc12 | 100.0 | 2.000000 | CHEMBL3885518 | Aryl hydrocarbon receptor nuclear translocator... |
| 3448 | 12.890514 | 12.890514 | 0.022566 | -4.652088 | 0.545055 | 11.666667 | 358.663 | 352.615 | 358.008052 | 124 | ... | 0 | 0 | 0 | 0 | CHEMBL2311967 | O=[N+]([O-])c1c(Nc2ccc(Cl)c(C(F)(F)F)c2)ccc2no... | 90.0 | 1.954243 | CHEMBL3885518 | Aryl hydrocarbon receptor nuclear translocator... |
| 3449 | 13.029538 | 13.029538 | 0.045958 | -3.817196 | 0.373880 | 11.833333 | 440.550 | 420.390 | 440.097683 | 154 | ... | 0 | 0 | 0 | 0 | CHEMBL3235901 | CC(C)c1ccc(S(=O)(=O)Nc2cc(Sc3nc[nH]n3)c(O)c3cc... | 1400.0 | 3.146128 | CHEMBL3414417 | Histone acetyltransferase RTT109 |
| 3450 | 14.145788 | 14.145788 | 0.115808 | -0.521369 | 0.867144 | 20.440000 | 365.840 | 344.672 | 365.141866 | 134 | ... | 0 | 0 | 0 | 0 | CHEMBL4446667 | COc1ccc(Nc2nc(N3C[C@@H](C)N[C@@H](C)C3)ncc2F)c... | 770.0 | 2.886491 | CHEMBL4105786 | B-cell lymphoma 6 protein |
3451 rows × 223 columns
PubChem Descriptor¶
In [10]:
import pandas as pd
# Step 1: Load the cleaned IC50 + metadata + mapping
mapping_dataset = pd.read_excel('/Users/janat/Desktop/ppimic50pred/dataset/dataset.xlsx')
# Ensure required columns exist
required_cols = ['chembl_id', 'pubchem_id', 'ic50_value', 'target_chembl_id', 'target_name']
missing_cols = [col for col in required_cols if col not in mapping_dataset.columns]
if missing_cols:
raise ValueError(f"Missing columns in mapping dataset: {missing_cols}")
# Add log_ic50
mapping_dataset['log_ic50'] = pd.to_numeric(mapping_dataset['ic50_value'], errors='coerce').apply(lambda x: np.log10(x) if pd.notna(x) and x > 0 else None)
mapping_dataset = mapping_dataset.dropna(subset=['log_ic50'])
# Keep only first unique pubchem_id
pubchem_info = mapping_dataset[['pubchem_id', 'chembl_id', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name']].drop_duplicates(subset='pubchem_id', keep='first')
# Step 2: Load PubChem descriptor data
pubchem_desc = pd.read_csv('/Users/janat/Desktop/ppimic50pred/dataset/full_pubchem_output.csv')
pubchem_desc = pubchem_desc.drop_duplicates(subset='CID', keep='first')
# Step 3: Merge PubChem descriptors with mapped IC50 info (CID ↔ pubchem_id)
merged_df_pubchem = pd.merge(
pubchem_desc,
pubchem_info,
left_on='CID',
right_on='pubchem_id',
how='inner'
).drop(columns=['pubchem_id'])
print("✅ Final merged_df_pubchem shape:", merged_df_pubchem.shape)
merged_df_pubchem
✅ Final merged_df_pubchem shape: (3444, 23)
Out[10]:
| CID | MW | XLogP3 | HBDC | HDAC | RBC | ExactMass | MonoMass | TPSA | HAC | ... | DASC | UASC | DBSC | UBSC | CBU | chembl_id | ic50_value | log_ic50 | target_chembl_id | target_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 637760 | 208.25 | 3.1 | 0 | 1 | 3 | 208.088815 | 208.088815 | 17.1 | 16 | ... | 0 | 0 | 1 | 0 | 1 | CHEMBL7976 | 1410.0 | 3.149219 | CHEMBL2039 | Monoamine oxidase B |
| 1 | 46700806 | 225.40 | 1.4 | 1 | 4 | 6 | 225.031578 | 225.031578 | 106.0 | 12 | ... | 0 | 0 | 0 | 0 | 1 | CHEMBL1094278 | 300.0 | 2.477121 | CHEMBL614361 | T47D |
| 2 | 46700805 | 265.50 | 4.4 | 0 | 3 | 9 | 265.099263 | 265.099263 | 85.9 | 15 | ... | 0 | 0 | 0 | 0 | 1 | CHEMBL1097571 | 230.0 | 2.361728 | CHEMBL614361 | T47D |
| 3 | 7188 | 320.60 | 3.4 | 0 | 4 | 3 | 320.050933 | 320.050933 | 121.0 | 18 | ... | 0 | 0 | 0 | 0 | 1 | CHEMBL121516 | 30.0 | 1.477121 | CHEMBL387 | MCF7 |
| 4 | 24969086 | 629.60 | 6.7 | 1 | 4 | 10 | 628.248415 | 628.248415 | 60.4 | 44 | ... | 1 | 0 | 0 | 0 | 1 | CHEMBL1236726 | 190.0 | 2.278754 | CHEMBL5023 | p53-binding protein Mdm-2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3439 | 6918296 | 528.50 | 5.4 | 2 | 8 | 2 | 528.221147 | 528.221147 | 58.2 | 37 | ... | 7 | 0 | 0 | 0 | 1 | CHEMBL1200969 | 4.0 | 0.602060 | CHEMBL1787 | Steroid 5-alpha-reductase 1 |
| 3440 | 70697712 | 308.65 | 3.8 | 1 | 7 | 2 | 308.011246 | 308.011246 | 96.8 | 21 | ... | 0 | 0 | 0 | 0 | 1 | CHEMBL2311933 | 100.0 | 2.000000 | CHEMBL3885518 | Aryl hydrocarbon receptor nuclear translocator... |
| 3441 | 2842377 | 358.66 | 4.6 | 1 | 9 | 2 | 358.008052 | 358.008052 | 96.8 | 24 | ... | 0 | 0 | 0 | 0 | 1 | CHEMBL2311967 | 90.0 | 1.954243 | CHEMBL3885518 | Aryl hydrocarbon receptor nuclear translocator... |
| 3442 | 2133805 | 440.50 | 4.9 | 3 | 7 | 6 | 440.097683 | 440.097683 | 142.0 | 30 | ... | 0 | 0 | 0 | 0 | 1 | CHEMBL3235901 | 1400.0 | 3.146128 | CHEMBL3414417 | Histone acetyltransferase RTT109 |
| 3443 | 155518808 | 365.80 | 3.4 | 2 | 7 | 4 | 365.141866 | 365.141866 | 62.3 | 25 | ... | 2 | 0 | 0 | 0 | 1 | CHEMBL4446667 | 770.0 | 2.886491 | CHEMBL4105786 | B-cell lymphoma 6 protein |
3444 rows × 23 columns
PaDEL Descriptor¶
In [13]:
# Load PaDEL-descriptor data
#The software currently calculates 1875 descriptors (1444 1D, 2D descriptors and 431 3D descriptors)
padel_data = pd.read_csv('/Users/janat/Desktop/ppimic50pred/dataset/Padel_descriptor.csv')
# Extract final IC50 + metadata from PubChem-merged result
padel_ic50 = merged_df_pubchem[['CID', 'chembl_id', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name']]
padel_ic50_unique = padel_ic50.drop_duplicates(subset='CID', keep='first')
# Merge PADEL on Name (same as CID)
merged_df_padel = pd.merge(
padel_data,
padel_ic50_unique,
left_on='Name',
right_on='CID',
how='inner'
).drop(columns=['CID'])
merged_df_padel
Out[13]:
| Name | nAcid | ALogP | ALogp2 | AMR | apol | naAromAtom | nAromBond | nAtom | nHeavyAtom | ... | Ts | As | Vs | Ks | Ds | chembl_id | ic50_value | log_ic50 | target_chembl_id | target_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 811 | 2 | 0.0671 | 0.004502 | 27.9149 | 16.008758 | 0 | 0 | 15 | 9 | ... | 6.399732 | 7.830642 | 14.810332 | 0.622780 | 1.073624 | CHEMBL359159 | 29400.0 | 4.468347 | CHEMBL4295587 | Isocitrate lyase 1 |
| 1 | 1110 | 2 | -0.4100 | 0.168100 | 23.5370 | 14.248758 | 0 | 0 | 14 | 8 | ... | 6.025117 | 5.529483 | 12.312929 | 0.728858 | 1.290113 | CHEMBL576 | 1.3 | 0.113943 | CHEMBL391 | Vero |
| 2 | 1983 | 0 | -0.2314 | 0.053546 | 44.7549 | 22.785137 | 6 | 6 | 20 | 11 | ... | 8.920007 | 12.727717 | 25.337999 | 0.715850 | 1.310099 | CHEMBL112 | 2300.0 | 3.361728 | CHEMBL2406892 | Myoglobin |
| 3 | 2259 | 3 | 0.6249 | 0.390500 | 114.4665 | 55.273102 | 0 | 0 | 45 | 31 | ... | 18.633544 | 102.834273 | 277.835978 | 0.298418 | 1.357003 | CHEMBL275938 | 12.0 | 1.079181 | CHEMBL1075138 | Tyrosyl-DNA phosphodiesterase 1 |
| 4 | 2333 | 0 | 1.4819 | 2.196028 | 99.6746 | 46.427516 | 0 | 0 | 34 | 22 | ... | 14.048063 | 51.655693 | 107.462124 | 0.419801 | 1.098138 | CHEMBL388590 | 16.0 | 1.204120 | CHEMBL3397 | Cytochrome P450 2C9 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2921 | 172465382 | 0 | 1.7263 | 2.980112 | 137.1516 | 66.582653 | 0 | 0 | 55 | 34 | ... | 27.733922 | 169.817591 | 380.819133 | 0.555319 | 1.488703 | CHEMBL5426754 | 5690.0 | 3.755112 | CHEMBL5465553 | WDR5-MYC |
| 2922 | 172467441 | 1 | 1.2620 | 1.592644 | 146.5647 | 70.445274 | 0 | 0 | 57 | 39 | ... | 29.379851 | 205.511883 | 493.020547 | 0.497470 | 1.501161 | CHEMBL5435549 | 190.0 | 2.278754 | CHEMBL5465553 | WDR5-MYC |
| 2923 | 172468432 | 0 | 0.6934 | 0.480804 | 122.5689 | 64.396239 | 0 | 0 | 55 | 32 | ... | 26.462041 | 143.801164 | 309.854220 | 0.601459 | 1.444163 | CHEMBL5435031 | 3830.0 | 3.583199 | CHEMBL5465553 | WDR5-MYC |
| 2924 | 172468926 | 2 | -2.5166 | 6.333276 | 139.0980 | 69.040653 | 0 | 0 | 57 | 36 | ... | 20.025451 | 112.390718 | 272.351175 | 0.370047 | 1.437387 | CHEMBL5438828 | 890.0 | 2.949390 | CHEMBL3038498 | Keap1/Nrf2 |
| 2925 | 172471290 | 1 | 0.6539 | 0.427585 | 127.4106 | 65.521653 | 0 | 0 | 56 | 35 | ... | 27.433477 | 176.050965 | 435.621833 | 0.520817 | 1.570948 | CHEMBL5440333 | 640.0 | 2.806180 | CHEMBL5465553 | WDR5-MYC |
2926 rows × 1881 columns
In [14]:
unique_targets_padel = merged_df_padel['target_name'].nunique()
unique_targets_rdkit = merged_df_rdkit['target_name'].nunique()
unique_targets_pubchem = merged_df_pubchem['target_name'].nunique()
print("Number of unique target_name entries:")
print(f" PaDEL: {unique_targets_padel}")
print(f" RDKit: {unique_targets_rdkit}")
print(f" PubChem: {unique_targets_pubchem}")
Number of unique target_name entries: PaDEL: 151 RDKit: 176 PubChem: 175
In [17]:
from rich.console import Console
from rich.table import Table
console = Console()
# Compute presence sets
padel_targets = set(merged_df_padel['target_name'].unique())
rdkit_targets = set(merged_df_rdkit['target_name'].unique())
pubchem_targets = set(merged_df_pubchem['target_name'].unique())
# Compute unique chembl_id counts per target_name for each dataset
padel_counts = merged_df_padel.groupby('target_name')['chembl_id'].nunique().to_dict()
rdkit_counts = merged_df_rdkit.groupby('target_name')['chembl_id'].nunique().to_dict()
pubchem_counts = merged_df_pubchem.groupby('target_name')['chembl_id'].nunique().to_dict()
all_targets = sorted(padel_targets | rdkit_targets | pubchem_targets)
def style_mark(present):
return "[green]✓[/green]" if present else "[red]×[/red]"
table = Table(title="")
# Add columns: presence and counts side-by-side
table.add_column("target_name", style="bold")
table.add_column("PaDEL")
table.add_column("PaDEL Count", justify="right")
table.add_column("RDKit")
table.add_column("RDKit Count", justify="right")
table.add_column("PubChem")
table.add_column("PubChem Count", justify="right")
for target in all_targets:
padel_present = target in padel_targets
rdkit_present = target in rdkit_targets
pubchem_present = target in pubchem_targets
table.add_row(
target,
style_mark(padel_present),
str(padel_counts.get(target, 0)) if padel_present else "-",
style_mark(rdkit_present),
str(rdkit_counts.get(target, 0)) if rdkit_present else "-",
style_mark(pubchem_present),
str(pubchem_counts.get(target, 0)) if pubchem_present else "-",
)
console.print(table)
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ target_name ┃ PaDEL ┃ PaDEL Count ┃ RDKit ┃ RDKit Count ┃ PubChem ┃ PubChem Count ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ A549 │ ✓ │ 15 │ ✓ │ 15 │ ✓ │ 15 │ │ ACHN │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Acetylcholinesterase │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Acyl coenzyme A:cholesterol │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ acyltransferase 1 │ │ │ │ │ │ │ │ Androgen receptor/PELP1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Apoptosis regulator Bcl-2 │ ✓ │ 64 │ ✓ │ 122 │ ✓ │ 124 │ │ Apoptosis regulator Bcl-W │ ✓ │ 3 │ ✓ │ 3 │ ✓ │ 3 │ │ Apoptosis regulator Bcl-X │ ✓ │ 124 │ ✓ │ 212 │ ✓ │ 212 │ │ Apoptotic protease-activating factor 1 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ Arachidonate 12-lipoxygenase │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Aryl hydrocarbon receptor nuclear │ ✓ │ 53 │ ✓ │ 53 │ ✓ │ 53 │ │ translocator/Endothelial PAS │ │ │ │ │ │ │ │ domain-containing protein 1 │ │ │ │ │ │ │ │ B-cell lymphoma 6 protein │ ✓ │ 35 │ ✓ │ 35 │ ✓ │ 35 │ │ BACH1/MafK │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ BCL-6/NCOR2 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ BCoR-BCL6 │ ✓ │ 15 │ ✓ │ 15 │ ✓ │ 15 │ │ Bcl-2-related protein A1 │ ✓ │ 5 │ ✓ │ 5 │ ✓ │ 5 │ │ Bcl-xL/Bcl-2-binding component 3 │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ Bcl2-antagonist of cell death (BAD) │ ✓ │ 2 │ ✓ │ 3 │ ✓ │ 3 │ │ Beta Lactamase │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Bromodomain adjacent to zinc finger │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ domain protein 2A │ │ │ │ │ │ │ │ Bromodomain adjacent to zinc finger │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ domain protein 2B │ │ │ │ │ │ │ │ Bromodomain-containing protein 2 │ ✓ │ 17 │ ✓ │ 17 │ ✓ │ 17 │ │ Bromodomain-containing protein 3 │ ✓ │ 31 │ ✓ │ 31 │ ✓ │ 31 │ │ Bromodomain-containing protein 4 │ ✓ │ 10 │ ✓ │ 10 │ ✓ │ 10 │ │ CBP/beta catenin │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ CD40-CD40L │ ✓ │ 5 │ ✓ │ 6 │ ✓ │ 6 │ │ CRBN/EZH2 │ × │ - │ ✓ │ 2 │ ✓ │ 2 │ │ CREB-binding protein │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ CREB-binding protein/p53 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Cannabinoid CB1 receptor │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Casein kinase I delta │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ Catenin beta-1/Transcription factor 7 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ Cellular tumor antigen p53/Replication │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ protein A 70 kDa DNA-binding subunit │ │ │ │ │ │ │ │ Cereblon/Histone deacetylase 6 │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ Cruzipain │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Cyclin-dependent kinase 2 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Cyclophilin A │ ✓ │ 80 │ ✓ │ 90 │ ✓ │ 89 │ │ Cyclophilin B │ × │ - │ ✓ │ 2 │ ✓ │ 2 │ │ Cysteinyl leukotriene receptor 1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Cytochrome P450 2C9 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Cytochrome P450 3A4 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ DNA (cytosine-5)-methyltransferase 1 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ DNA repair protein RAD51 homolog 1 │ ✓ │ 16 │ ✓ │ 16 │ ✓ │ 16 │ │ DU-145 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Daudi │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Death-associated protein kinase 1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Dipeptidyl peptidase IV │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Dopamine D3 receptor │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ EOL1 │ × │ - │ ✓ │ 2 │ ✓ │ 2 │ │ Entamoeba histolytica │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Estrogen receptor alpha │ × │ - │ ✓ │ 2 │ ✓ │ 2 │ │ Estrogen receptor beta │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Estrogen-related receptor alpha │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ Fanconi anemia group F │ ✓ │ 1 │ ✓ │ 2 │ ✓ │ 2 │ │ protein/Transcription factor HES-1 │ │ │ │ │ │ │ │ G-protein coupled receptor 35 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ G-protein coupled receptor 55 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Giardia intestinalis │ ✓ │ 3 │ ✓ │ 3 │ ✓ │ 3 │ │ Glycogen synthase kinase-3 alpha │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ H9c2 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ HCT-116 │ ✓ │ 27 │ ✓ │ 35 │ ✓ │ 35 │ │ HIF1A/p300/CREB-binding protein │ ✓ │ 3 │ ✓ │ 3 │ ✓ │ 3 │ │ HL-60 │ ✓ │ 5 │ ✓ │ 5 │ ✓ │ 5 │ │ HeLa │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ HepG2 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Hepatitis C virus │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Histone acetyltransferase RTT109 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ Histone-lysine N-methyltransferase │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Histone-lysine N-methyltransferase │ ✓ │ 33 │ ✓ │ 33 │ ✓ │ 33 │ │ 2A/WDR5 │ │ │ │ │ │ │ │ Histone-lysine N-methyltransferase EZH2 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Human immunodeficiency virus type 1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ integrase │ │ │ │ │ │ │ │ IL15-IL15 receptor │ ✓ │ 12 │ ✓ │ 17 │ ✓ │ 17 │ │ Indoleamine 2 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Induced myeloid leukemia cell │ ✓ │ 26 │ ✓ │ 30 │ ✓ │ 30 │ │ differentiation protein Mcl-1 │ │ │ │ │ │ │ │ Integrin alpha-V/beta-3 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Interleukin-15 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ Isocitrate lyase 1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ J774.A1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ JAR │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Jurkat │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ K562 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ KB │ ✓ │ 4 │ ✓ │ 4 │ ✓ │ 4 │ │ Keap1-p62 │ ✓ │ 10 │ ✓ │ 10 │ ✓ │ 10 │ │ Keap1/Nrf2 │ ✓ │ 189 │ ✓ │ 213 │ ✓ │ 213 │ │ Kelch-like ECH-associated protein 1 │ ✓ │ 9 │ ✓ │ 12 │ ✓ │ 12 │ │ LNCaP │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ LNCaP C4-2B │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ Leishmania donovani │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Leishmania infantum │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Lysine-specific histone demethylase 1 │ ✓ │ 14 │ ✓ │ 17 │ ✓ │ 14 │ │ M17 leucyl aminopeptidase │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ M18 aspartyl aminopeptidase │ ✓ │ 1 │ ✓ │ 2 │ ✓ │ 2 │ │ MAP kinase p38 alpha │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ MCF7 │ ✓ │ 6 │ ✓ │ 6 │ ✓ │ 6 │ │ MDA-MB-231 │ ✓ │ 5 │ ✓ │ 5 │ ✓ │ 5 │ │ MLL1-ASH2L/RbBP5/WDR5/DPY30 │ ✓ │ 18 │ ✓ │ 18 │ ✓ │ 18 │ │ MOLM-13 │ ✓ │ 4 │ ✓ │ 4 │ ✓ │ 4 │ │ MV4-11 │ ✓ │ 6 │ ✓ │ 7 │ ✓ │ 7 │ │ Matrix metalloproteinase 13 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Menin │ ✓ │ 23 │ ✓ │ 46 │ ✓ │ 46 │ │ Menin/Histone-lysine N-methyltransferase │ ✓ │ 111 │ ✓ │ 142 │ ✓ │ 142 │ │ MLL │ │ │ │ │ │ │ │ Monoamine oxidase B │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Mothers against decapentaplegic homolog 3 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Myc proto-oncogene protein │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Myoglobin │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ NAD(+) hydrolase SARM1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ NCI-H1417 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ NCI-H1963 │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ NCI-H23 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ NIH3T3 │ ✓ │ 12 │ ✓ │ 12 │ ✓ │ 12 │ │ NON-PROTEIN TARGET │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ NRP2-VEGA │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ Nitric oxide synthase, inducible │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Nuclear receptor subfamily 4 group A │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ member 1 │ │ │ │ │ │ │ │ Orexin receptor 1 │ ✓ │ 3 │ ✓ │ 3 │ ✓ │ 3 │ │ PBMC │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ PRMT5/MEP50 complex │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ PTPN9/STAT3 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Peptidyl-prolyl cis-trans isomerase │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ NIMA-interacting 1 │ │ │ │ │ │ │ │ Peroxisome proliferator-activated │ ✓ │ 118 │ ✓ │ 122 │ ✓ │ 122 │ │ receptor gamma/Nuclear receptor │ │ │ │ │ │ │ │ corepressor 2 │ │ │ │ │ │ │ │ Phospholipase A2 group 1B │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ Photoreceptor-specific nuclear receptor │ ✓ │ 62 │ ✓ │ 62 │ ✓ │ 62 │ │ Plasmodium falciparum │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Prion protein │ ✓ │ 8 │ ✓ │ 8 │ ✓ │ 8 │ │ Proteasome assembly chaperone 3 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Protein Mdm4 │ ✓ │ 1 │ ✓ │ 7 │ ✓ │ 7 │ │ Protein arginine N-methyltransferase 3 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Protein cereblon/DNA damage-binding │ × │ - │ ✓ │ 3 │ ✓ │ 3 │ │ protein 1 │ │ │ │ │ │ │ │ Protein cereblon/Estrogen receptor │ × │ - │ ✓ │ 12 │ ✓ │ 12 │ │ Protein cereblon/Histone deacetylase 3 │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ Protein skinhead-1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Protein tyrosine kinase 2 beta │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Protein-tyrosine phosphatase LC-PTP │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Putative hexokinase HKDC1 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ RAW264.7 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Raji │ ✓ │ 4 │ ✓ │ 4 │ ✓ │ 4 │ │ Ras and Rab interactor 1/Tyrosine-protein │ ✓ │ 3 │ ✓ │ 3 │ ✓ │ 3 │ │ kinase ABL1 │ │ │ │ │ │ │ │ Runt-related transcription factor │ ✓ │ 56 │ ✓ │ 58 │ ✓ │ 58 │ │ 1/Core-binding factor subunit beta │ │ │ │ │ │ │ │ SAOS-2 │ ✓ │ 10 │ ✓ │ 10 │ ✓ │ 10 │ │ SGC-7901 │ ✓ │ 6 │ ✓ │ 6 │ ✓ │ 6 │ │ SJSA-1 │ ✓ │ 6 │ ✓ │ 6 │ ✓ │ 6 │ │ SK-MEL-30 │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ SUMO-activating enzyme │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ SW-620 │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ Sentrin-specific protease 6 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Sentrin-specific protease 7 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Serine/threonine protein phosphatase 2B │ × │ - │ ✓ │ 1 │ × │ - │ │ catalytic subunit │ │ │ │ │ │ │ │ Serine/threonine-protein kinase Chk1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Serine/threonine-protein kinase PIM1 │ ✓ │ 4 │ ✓ │ 4 │ ✓ │ 4 │ │ Serine/threonine-protein kinase PIM2 │ ✓ │ 11 │ ✓ │ 11 │ ✓ │ 11 │ │ Steroid 5-alpha-reductase 1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ T-cell │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ T47D │ ✓ │ 2 │ ✓ │ 2 │ ✓ │ 2 │ │ Transcription intermediary factor 1-alpha │ ✓ │ 1 │ ✓ │ 2 │ ✓ │ 2 │ │ Transcriptional coactivator │ × │ - │ ✓ │ 20 │ ✓ │ 20 │ │ YAP1/Transcriptional enhancer factor │ │ │ │ │ │ │ │ TEF-1 │ │ │ │ │ │ │ │ Transcriptional enhancer factor TEF-3 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ Tumor necrosis factor ligand superfamily │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ member 11 │ │ │ │ │ │ │ │ Tumour suppressor p53/oncoprotein Mdm2 │ ✓ │ 1051 │ ✓ │ 1137 │ ✓ │ 1135 │ │ Tumour suppressor protein p53/Mdm4 │ ✓ │ 12 │ ✓ │ 13 │ ✓ │ 13 │ │ Tyrosyl-DNA phosphodiesterase 1 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ U2OS │ ✓ │ 7 │ ✓ │ 7 │ ✓ │ 7 │ │ Unchecked │ ✓ │ 91 │ ✓ │ 135 │ ✓ │ 135 │ │ V79 │ ✓ │ 1 │ ✓ │ 1 │ ✓ │ 1 │ │ VEGA-NRP1 │ × │ - │ ✓ │ 16 │ ✓ │ 16 │ │ VHL/Estrogen receptor │ × │ - │ ✓ │ 3 │ ✓ │ 3 │ │ VHL/Histone deacetylase 3 │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ VHL/KDM1A/HDAC1/RCOR1 │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ Vero │ ✓ │ 3 │ ✓ │ 3 │ ✓ │ 3 │ │ Voltage-gated N-type calcium channel │ ✓ │ 3 │ ✓ │ 3 │ ✓ │ 3 │ │ alpha-1B subunit/Amyloid beta A4 │ │ │ │ │ │ │ │ precursor protein-binding family A member │ │ │ │ │ │ │ │ 1 │ │ │ │ │ │ │ │ Von Hippel-Lindau disease tumor │ × │ - │ ✓ │ 3 │ ✓ │ 3 │ │ suppressor/Elongin B/Elongin C │ │ │ │ │ │ │ │ WD repeat-containing protein 5 │ ✓ │ 3 │ ✓ │ 4 │ ✓ │ 4 │ │ WDR5-MYC │ ✓ │ 27 │ ✓ │ 27 │ ✓ │ 27 │ │ YAP1-TEAD4 │ ✓ │ 6 │ ✓ │ 6 │ ✓ │ 6 │ │ beta-catenin-B-cell lymphoma 9 protein │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ complex │ │ │ │ │ │ │ │ c-Myc/c-Max │ ✓ │ 11 │ ✓ │ 11 │ ✓ │ 11 │ │ p53-binding protein Mdm-2 │ ✓ │ 318 │ ✓ │ 356 │ ✓ │ 354 │ │ von Hippel-Lindau disease tumor │ × │ - │ ✓ │ 1 │ ✓ │ 1 │ │ suppressor/Elongin-B/Elongin-C/Bromodoma… │ │ │ │ │ │ │ │ protein 4 │ │ │ │ │ │ │ └───────────────────────────────────────────┴───────┴─────────────┴───────┴─────────────┴─────────┴───────────────┘
Comparison table save to word file¶
In [20]:
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Prepare data as before
padel_targets = set(merged_df_padel['target_name'].unique())
rdkit_targets = set(merged_df_rdkit['target_name'].unique())
pubchem_targets = set(merged_df_pubchem['target_name'].unique())
padel_counts = merged_df_padel.groupby('target_name')['chembl_id'].nunique().to_dict()
rdkit_counts = merged_df_rdkit.groupby('target_name')['chembl_id'].nunique().to_dict()
pubchem_counts = merged_df_pubchem.groupby('target_name')['chembl_id'].nunique().to_dict()
all_targets = sorted(padel_targets | rdkit_targets | pubchem_targets)
def presence_mark(present):
return "✓" if present else "×"
# Create a Word document
doc = Document()
doc.add_heading('Target Name Comparison with ChEMBL Counts', level=1)
# Add a table: columns = target_name + 6 (PaDEL presence/count, RDKit presence/count, PubChem presence/count)
table = doc.add_table(rows=1, cols=7)
table.style = 'Light List Accent 1'
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Target Name'
hdr_cells[1].text = 'PaDEL'
hdr_cells[2].text = 'PaDEL Count'
hdr_cells[3].text = 'RDKit'
hdr_cells[4].text = 'RDKit Count'
hdr_cells[5].text = 'PubChem'
hdr_cells[6].text = 'PubChem Count'
for target in all_targets:
row_cells = table.add_row().cells
row_cells[0].text = target
padel_present = target in padel_targets
rdkit_present = target in rdkit_targets
pubchem_present = target in pubchem_targets
row_cells[1].text = presence_mark(padel_present)
row_cells[2].text = str(padel_counts.get(target, 0)) if padel_present else '-'
row_cells[3].text = presence_mark(rdkit_present)
row_cells[4].text = str(rdkit_counts.get(target, 0)) if rdkit_present else '-'
row_cells[5].text = presence_mark(pubchem_present)
row_cells[6].text = str(pubchem_counts.get(target, 0)) if pubchem_present else '-'
# Optional: adjust font size for all cells
for row in table.rows:
for cell in row.cells:
for paragraph in cell.paragraphs:
for run in paragraph.runs:
run.font.size = Pt(10)
# Save the document
doc.save('/Users/janat/Desktop/ppimic50pred/target_name_comparison.docx')
print("Saved comparison table as 'target_name_comparison.docx'")
Saved comparison table as 'target_name_comparison.docx'
Dataset: Training and Blindset for RDkit, PubChem, & PaDEL¶
In [23]:
def split_full_dataset(df, descriptor_name="DescriptorSet"):
"""
Splits the entire dataset (with all columns) into 80% training and 20% blind sets.
Parameters:
- df: pandas DataFrame to split
- descriptor_name: descriptor set name for logging
Returns:
- train_df: 80% of the original DataFrame
- blind_df: 20% of the original DataFrame
"""
train_df, blind_df = train_test_split(df, test_size=0.20, random_state=42)
print(f"📊 {descriptor_name} Split (All Columns Kept):")
print(f" Training Set: {train_df.shape}")
print(f" Blind Set: {blind_df.shape}\n")
return train_df, blind_df
In [25]:
# RDKit
rdkit_train, rdkit_blind = split_full_dataset(merged_df_rdkit, descriptor_name="RDKit")
# PubChem
pubchem_train, pubchem_blind = split_full_dataset(merged_df_pubchem, descriptor_name="PubChem")
# PaDEL
padel_train, padel_blind = split_full_dataset(merged_df_padel, descriptor_name="PaDEL")
📊 RDKit Split (All Columns Kept): Training Set: (2760, 223) Blind Set: (691, 223) 📊 PubChem Split (All Columns Kept): Training Set: (2755, 23) Blind Set: (689, 23) 📊 PaDEL Split (All Columns Kept): Training Set: (2340, 1881) Blind Set: (586, 1881)
RDKit Model Development¶
In [28]:
# Print all column names
print("🧾 RDKit Training Columns:")
print(rdkit_train.columns.tolist())
# Count total columns
total_columns = len(rdkit_train.columns)
print(f"\n🔢 Total columns: {total_columns}")
🧾 RDKit Training Columns: ['MaxAbsEStateIndex', 'MaxEStateIndex', 'MinAbsEStateIndex', 'MinEStateIndex', 'qed', 'SPS', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1', 'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7', 'VSA_EState8', 'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAmideBonds', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumAtomStereoCenters', 'NumBridgeheadAtoms', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumHeterocycles', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'NumSpiroAtoms', 'NumUnspecifiedAtomStereoCenters', 'Phi', 'RingCount', 'MolLogP', 'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo', 'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine', 'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol', 'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide', 'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'chembl_id', 'smiles', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name'] 🔢 Total columns: 223
Feature Selection: Feature correlations (Unsupervised)¶
In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# ============================
# Step 1: Prepare the features
# ============================
y = rdkit_train['log_ic50']
drop_cols = ['chembl_id', 'smiles', 'ic50_value', 'log_ic50',
'target_chembl_id', 'target_name']
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
# Replace blanks with NaN
X = X.replace(r'^\s*$', np.nan, regex=True)
# Convert to numeric (coerce errors to NaN)
for col in X.columns:
X[col] = pd.to_numeric(X[col], errors='coerce')
# Fill NaN with column means
X = X.fillna(X.mean())
print(f"Feature count after cleaning: {X.shape[1]}")
# Identify zero-variance columns
zero_var_cols = X.columns[X.var() == 0]
print(f"Zero variance columns count: {len(zero_var_cols)}")
print(f"Zero variance columns: {list(zero_var_cols)}")
# Remove zero-variance columns for plotting
X_plot = X.drop(columns=zero_var_cols)
print(f"Final features used for plotting: {X_plot.shape[1]}")
# =======================================
# Step 2: Identify significant features
# =======================================
corr_with_target = X_plot.corrwith(y)
significant_features = corr_with_target[abs(corr_with_target) >= 0.5] # threshold adjustable
significant_features = significant_features.sort_values(key=lambda x: abs(x), ascending=False)
print("\nSignificant features (|correlation| >= 0.5 with log_ic50):")
for feat, corr_val in significant_features.items():
print(f"{feat}: {corr_val:.3f}")
# =======================================
# Step 3: Compute correlation matrix
# =======================================
corr_matrix = X_plot.corr(min_periods=1)
print(f"Correlation matrix shape: {corr_matrix.shape}")
# Save full correlation matrix to CSV
corr_matrix.to_csv("/Users/janat/Desktop/ppimic50pred/feature_correlation_matrix_filtered.csv")
# ============================
# Step 4: Plot the heatmap (all ticks, small font)
# ============================
fig, ax = plt.subplots(figsize=(20, 18)) # Large figure to accommodate many labels
sns.heatmap(
corr_matrix,
cmap='coolwarm',
center=0,
annot=False,
fmt=".2f",
cbar_kws={'label': 'Correlation', 'shrink': 0.5},
square=True,
ax=ax
)
# Show all tick labels with very small font and rotated x labels
ax.set_xticks(np.arange(len(corr_matrix.columns)) + 0.5)
ax.set_xticklabels(corr_matrix.columns, rotation=90, fontsize=4.4, ha='center')
ax.set_yticks(np.arange(len(corr_matrix.index)) + 0.5)
ax.set_yticklabels(corr_matrix.index, rotation=0, fontsize=4.4)
ax.set_title("Feature Correlation Heatmap", fontsize=14)
plt.tight_layout()
# Save the heatmap to a high resolution file
plt.savefig("/Users/janat/Desktop/ppimic50pred/feature_correlation_heatmap_full_ticks.png", dpi=300, bbox_inches='tight')
plt.show()
Feature count after cleaning: 217 Zero variance columns count: 15 Zero variance columns: ['NumRadicalElectrons', 'SMR_VSA8', 'SlogP_VSA9', 'fr_aldehyde', 'fr_azide', 'fr_barbitur', 'fr_benzodiazepine', 'fr_diazo', 'fr_epoxide', 'fr_isocyan', 'fr_lactam', 'fr_nitroso', 'fr_prisulfonamd', 'fr_quatN', 'fr_thiocyan'] Final features used for plotting: 202 Significant features (|correlation| >= 0.5 with log_ic50): Correlation matrix shape: (202, 202)
In [32]:
corr_matrix
Out[32]:
| MaxAbsEStateIndex | MaxEStateIndex | MinAbsEStateIndex | MinEStateIndex | qed | SPS | MolWt | HeavyAtomMolWt | ExactMolWt | NumValenceElectrons | ... | fr_pyridine | fr_sulfide | fr_sulfonamd | fr_sulfone | fr_term_acetylene | fr_tetrazole | fr_thiazole | fr_thiophene | fr_unbrch_alkane | fr_urea | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| MaxAbsEStateIndex | 1.000000 | 1.000000 | -0.559066 | -0.338294 | -0.347496 | 0.239793 | 0.441315 | 0.445334 | 0.441088 | 0.433240 | ... | -0.011651 | -0.050506 | 0.113055 | 0.110933 | -0.058662 | 0.044134 | -0.050806 | -0.067882 | 0.011521 | 0.017483 |
| MaxEStateIndex | 1.000000 | 1.000000 | -0.559066 | -0.338294 | -0.347496 | 0.239793 | 0.441315 | 0.445334 | 0.441088 | 0.433240 | ... | -0.011651 | -0.050506 | 0.113055 | 0.110933 | -0.058662 | 0.044134 | -0.050806 | -0.067882 | 0.011521 | 0.017483 |
| MinAbsEStateIndex | -0.559066 | -0.559066 | 1.000000 | 0.207931 | 0.241266 | -0.129251 | -0.261840 | -0.265135 | -0.261725 | -0.261134 | ... | 0.008361 | -0.021724 | -0.102624 | -0.033824 | -0.003107 | -0.026917 | -0.011413 | 0.092516 | -0.037938 | 0.030824 |
| MinEStateIndex | -0.338294 | -0.338294 | 0.207931 | 1.000000 | 0.267902 | -0.012741 | -0.269907 | -0.279977 | -0.270075 | -0.250547 | ... | -0.001183 | -0.009443 | -0.590127 | -0.306641 | 0.028063 | -0.015371 | 0.052839 | -0.194450 | 0.079989 | 0.111741 |
| qed | -0.347496 | -0.347496 | 0.241266 | 0.267902 | 1.000000 | 0.012332 | -0.693489 | -0.697765 | -0.693409 | -0.672402 | ... | 0.013057 | -0.158715 | -0.238426 | 0.017479 | 0.003872 | -0.051802 | -0.002293 | 0.156400 | -0.256958 | -0.068763 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| fr_tetrazole | 0.044134 | 0.044134 | -0.026917 | -0.015371 | -0.051802 | -0.034290 | 0.120868 | 0.122782 | 0.120938 | 0.121677 | ... | -0.035813 | -0.030585 | 0.077976 | -0.024088 | -0.002991 | 1.000000 | -0.018884 | -0.024672 | 0.035246 | -0.018153 |
| fr_thiazole | -0.050806 | -0.050806 | -0.011413 | 0.052839 | -0.002293 | -0.098514 | -0.040058 | -0.037312 | -0.039910 | -0.045286 | ... | 0.029065 | -0.035072 | -0.059431 | 0.016223 | -0.006871 | -0.018884 | 1.000000 | 0.004567 | -0.025707 | -0.024296 |
| fr_thiophene | -0.067882 | -0.067882 | 0.092516 | -0.194450 | 0.156400 | -0.001544 | -0.076748 | -0.074920 | -0.076423 | -0.074294 | ... | -0.001997 | -0.019047 | -0.099957 | 0.055068 | -0.008977 | -0.024672 | 0.004567 | 1.000000 | -0.062906 | -0.054491 |
| fr_unbrch_alkane | 0.011521 | 0.011521 | -0.037938 | 0.079989 | -0.256958 | -0.103642 | 0.246375 | 0.240140 | 0.246496 | 0.251987 | ... | -0.075828 | 0.216716 | -0.031852 | -0.035772 | -0.007849 | 0.035246 | -0.025707 | -0.062906 | 1.000000 | 0.073036 |
| fr_urea | 0.017483 | 0.017483 | 0.030824 | 0.111741 | -0.068763 | -0.126248 | 0.062470 | 0.060604 | 0.062264 | 0.055286 | ... | -0.008721 | -0.009759 | -0.081612 | -0.053202 | 0.023037 | -0.018153 | -0.024296 | -0.054491 | 0.073036 | 1.000000 |
202 rows × 202 columns
Tunning Parameters of Random Forest using 10-fold Cross Validation¶
In [36]:
import numpy as np
import pandas as pd
import warnings
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
# Suppress specific RuntimeWarnings
warnings.filterwarnings("ignore", message=re.escape("overflow encountered in cast"), category=RuntimeWarning)
# Define custom RMSE scorer
def rmse(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
neg_rmse_scorer = make_scorer(rmse, greater_is_better=False)
# Prepare the data
y = rdkit_train['log_ic50']
drop_cols = ['chembl_id', 'smiles', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name']
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
X = X.replace(r'^\s*$', np.nan, regex=True)
X = X.apply(pd.to_numeric, errors='coerce')
X = X.replace([np.inf, -np.inf], np.nan)
X = X.clip(upper=1e6, lower=-1e6)
X = X.fillna(X.mean())
# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Define model and hyperparameter grid
rf = RandomForestRegressor(random_state=42)
rf_param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [10, 15, 20, None],
'min_samples_split': [2, 5],
'min_samples_leaf': [1, 2],
'max_features': ['sqrt', 'log2', None]
}
# Grid Search
rf_grid_search = GridSearchCV(
estimator=rf,
param_grid=rf_param_grid,
scoring=neg_rmse_scorer,
cv=10,
n_jobs=-1,
verbose=2
)
rf_grid_search.fit(X_train, y_train)
# Store best parameters
rf_best_params = rf_grid_search.best_params_
# Results
print("Random Forest Best Parameters:")
print(rf_best_params)
# Print best score (convert from negative RMSE to positive RMSE)
best_neg_rmse = rf_grid_search.best_score_
best_rmse = -best_neg_rmse # since we used negative RMSE as scorer
print(f"Best RMSE Score (10-fold CV): {best_rmse:.4f}")
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Random Forest Best Parameters:
{'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best RMSE Score (10-fold CV): 0.7954
Tunning Parameters of LSTM deep learning¶
In [38]:
import itertools
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", message="overflow encountered*", category=RuntimeWarning)
# Define LSTM model
class LSTMRegressor(nn.Module):
def __init__(self, input_size, hidden_size=64, num_layers=1, activation_fn=nn.ReLU()):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size, 1)
self.activation = activation_fn
def forward(self, x):
out, _ = self.lstm(x)
out = out[:, -1, :]
out = self.fc(out)
out = self.activation(out)
return out
# Training function
def train_model(model, train_loader, val_loader, epochs=50, patience=7, lr=0.001):
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
best_val_loss = float('inf')
patience_counter = 0
best_state = None
for epoch in range(epochs):
model.train()
for xb, yb in train_loader:
optimizer.zero_grad()
preds = model(xb)
loss = criterion(preds, yb)
loss.backward()
optimizer.step()
model.eval()
val_losses = []
with torch.no_grad():
for xb, yb in val_loader:
preds = model(xb)
loss = criterion(preds, yb)
val_losses.append(loss.item())
avg_val_loss = np.mean(val_losses)
if avg_val_loss < best_val_loss:
best_val_loss = avg_val_loss
best_state = model.state_dict()
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= patience:
break
model.load_state_dict(best_state)
return model, best_val_loss
# Prepare data
y = rdkit_train['log_ic50']
drop_cols = ['chembl_id', 'smiles', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name']
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
X = X.replace(r'^\s*$', np.nan, regex=True)
X = X.apply(pd.to_numeric, errors='coerce')
X = X.fillna(X.mean())
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_train_seq = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_val_seq = X_val_scaled.reshape(X_val_scaled.shape[0], 1, X_val_scaled.shape[1])
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values.reshape(-1, 1), dtype=torch.float32)
train_ds = TensorDataset(X_train_tensor, y_train_tensor)
val_ds = TensorDataset(X_val_tensor, y_val_tensor)
# Hyperparameter search
activation_functions = {
'relu': nn.ReLU(),
'tanh': nn.Tanh(),
'sigmoid': nn.Sigmoid(),
'leaky_relu': nn.LeakyReLU()
}
lstm_param_grid = {
'hidden_size': [32, 64, 128],
'num_layers': [1, 2],
'learning_rate': [1e-3, 5e-4],
'batch_size': [32, 64],
'activation': list(activation_functions.keys())
}
lstm_results = []
for hidden_size, num_layers, lr, batch_size, act_name in itertools.product(
lstm_param_grid['hidden_size'],
lstm_param_grid['num_layers'],
lstm_param_grid['learning_rate'],
lstm_param_grid['batch_size'],
lstm_param_grid['activation']
):
print(f"\nTraining: hidden_size={hidden_size}, num_layers={num_layers}, lr={lr}, "
f"batch_size={batch_size}, activation={act_name}")
activation_fn = activation_functions[act_name]
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size)
model = LSTMRegressor(
input_size=X_train_tensor.shape[2],
hidden_size=hidden_size,
num_layers=num_layers,
activation_fn=activation_fn
)
model, val_loss = train_model(model, train_loader, val_loader, epochs=50, patience=7, lr=lr)
model.eval()
val_preds, val_true = [], []
with torch.no_grad():
for xb, yb in val_loader:
preds = model(xb)
val_preds.append(preds.cpu().numpy())
val_true.append(yb.cpu().numpy())
val_preds = np.vstack(val_preds).flatten()
val_true = np.vstack(val_true).flatten()
mse = mean_squared_error(val_true, val_preds)
rmse = np.sqrt(mse)
r2 = r2_score(val_true, val_preds)
print(f"→ RMSE={rmse:.4f}, R²={r2:.4f}")
lstm_results.append({
'hidden_size': hidden_size,
'num_layers': num_layers,
'learning_rate': lr,
'batch_size': batch_size,
'activation': act_name,
'RMSE': rmse,
'R2': r2
})
# Store best parameters
results_df = pd.DataFrame(lstm_results)
results_df_sorted = results_df.sort_values(by='RMSE', ascending=True)
print("\n=== LSTM Grid Search Results (sorted by RMSE) ===")
print(results_df_sorted.to_string(index=False, float_format="%.4f"))
best_row = results_df_sorted.iloc[0]
lstm_best_params = {
'hidden_size': int(best_row['hidden_size']),
'num_layers': int(best_row['num_layers']),
'learning_rate': float(best_row['learning_rate']),
'batch_size': int(best_row['batch_size']),
'activation': best_row['activation']
}
print("\n=== Best LSTM Hyperparameters ===")
print(lstm_best_params)
Training: hidden_size=32, num_layers=1, lr=0.001, batch_size=32, activation=relu
→ RMSE=0.8099, R²=0.7308
Training: hidden_size=32, num_layers=1, lr=0.001, batch_size=32, activation=tanh
→ RMSE=1.9888, R²=-0.6233
Training: hidden_size=32, num_layers=1, lr=0.001, batch_size=32, activation=sigmoid
→ RMSE=1.9940, R²=-0.6317
Training: hidden_size=32, num_layers=1, lr=0.001, batch_size=32, activation=leaky_relu
→ RMSE=0.8322, R²=0.7157
Training: hidden_size=32, num_layers=1, lr=0.001, batch_size=64, activation=relu
→ RMSE=0.8045, R²=0.7344
Training: hidden_size=32, num_layers=1, lr=0.001, batch_size=64, activation=tanh
→ RMSE=1.9923, R²=-0.6289
Training: hidden_size=32, num_layers=1, lr=0.001, batch_size=64, activation=sigmoid
→ RMSE=1.9936, R²=-0.6312
Training: hidden_size=32, num_layers=1, lr=0.001, batch_size=64, activation=leaky_relu
→ RMSE=0.8254, R²=0.7204
Training: hidden_size=32, num_layers=1, lr=0.0005, batch_size=32, activation=relu
→ RMSE=0.8106, R²=0.7304
Training: hidden_size=32, num_layers=1, lr=0.0005, batch_size=32, activation=tanh
→ RMSE=1.9910, R²=-0.6269
Training: hidden_size=32, num_layers=1, lr=0.0005, batch_size=32, activation=sigmoid
→ RMSE=1.9941, R²=-0.6320
Training: hidden_size=32, num_layers=1, lr=0.0005, batch_size=32, activation=leaky_relu
→ RMSE=0.8186, R²=0.7250
Training: hidden_size=32, num_layers=1, lr=0.0005, batch_size=64, activation=relu
→ RMSE=0.7976, R²=0.7389
Training: hidden_size=32, num_layers=1, lr=0.0005, batch_size=64, activation=tanh
→ RMSE=1.9924, R²=-0.6291
Training: hidden_size=32, num_layers=1, lr=0.0005, batch_size=64, activation=sigmoid
→ RMSE=1.9958, R²=-0.6348
Training: hidden_size=32, num_layers=1, lr=0.0005, batch_size=64, activation=leaky_relu
→ RMSE=0.8350, R²=0.7139
Training: hidden_size=32, num_layers=2, lr=0.001, batch_size=32, activation=relu
→ RMSE=2.7765, R²=-2.1637
Training: hidden_size=32, num_layers=2, lr=0.001, batch_size=32, activation=tanh
→ RMSE=1.9945, R²=-0.6326
Training: hidden_size=32, num_layers=2, lr=0.001, batch_size=32, activation=sigmoid
→ RMSE=1.9928, R²=-0.6298
Training: hidden_size=32, num_layers=2, lr=0.001, batch_size=32, activation=leaky_relu
→ RMSE=0.8722, R²=0.6878
Training: hidden_size=32, num_layers=2, lr=0.001, batch_size=64, activation=relu
→ RMSE=2.7765, R²=-2.1637
Training: hidden_size=32, num_layers=2, lr=0.001, batch_size=64, activation=tanh
→ RMSE=1.9928, R²=-0.6298
Training: hidden_size=32, num_layers=2, lr=0.001, batch_size=64, activation=sigmoid
→ RMSE=1.9931, R²=-0.6302
Training: hidden_size=32, num_layers=2, lr=0.001, batch_size=64, activation=leaky_relu
→ RMSE=0.8107, R²=0.7302
Training: hidden_size=32, num_layers=2, lr=0.0005, batch_size=32, activation=relu
→ RMSE=2.7765, R²=-2.1637
Training: hidden_size=32, num_layers=2, lr=0.0005, batch_size=32, activation=tanh
→ RMSE=1.9955, R²=-0.6342
Training: hidden_size=32, num_layers=2, lr=0.0005, batch_size=32, activation=sigmoid
→ RMSE=1.9930, R²=-0.6301
Training: hidden_size=32, num_layers=2, lr=0.0005, batch_size=32, activation=leaky_relu
→ RMSE=0.8125, R²=0.7291
Training: hidden_size=32, num_layers=2, lr=0.0005, batch_size=64, activation=relu
→ RMSE=2.7765, R²=-2.1637
Training: hidden_size=32, num_layers=2, lr=0.0005, batch_size=64, activation=tanh
→ RMSE=1.9905, R²=-0.6261
Training: hidden_size=32, num_layers=2, lr=0.0005, batch_size=64, activation=sigmoid
→ RMSE=1.9928, R²=-0.6298
Training: hidden_size=32, num_layers=2, lr=0.0005, batch_size=64, activation=leaky_relu
→ RMSE=0.8276, R²=0.7189
Training: hidden_size=64, num_layers=1, lr=0.001, batch_size=32, activation=relu
→ RMSE=0.8023, R²=0.7359
Training: hidden_size=64, num_layers=1, lr=0.001, batch_size=32, activation=tanh
→ RMSE=1.9919, R²=-0.6283
Training: hidden_size=64, num_layers=1, lr=0.001, batch_size=32, activation=sigmoid
→ RMSE=1.9920, R²=-0.6286
Training: hidden_size=64, num_layers=1, lr=0.001, batch_size=32, activation=leaky_relu
→ RMSE=0.8029, R²=0.7354
Training: hidden_size=64, num_layers=1, lr=0.001, batch_size=64, activation=relu
→ RMSE=0.7974, R²=0.7391
Training: hidden_size=64, num_layers=1, lr=0.001, batch_size=64, activation=tanh
→ RMSE=1.9891, R²=-0.6238
Training: hidden_size=64, num_layers=1, lr=0.001, batch_size=64, activation=sigmoid
→ RMSE=1.9926, R²=-0.6294
Training: hidden_size=64, num_layers=1, lr=0.001, batch_size=64, activation=leaky_relu
→ RMSE=0.8063, R²=0.7332
Training: hidden_size=64, num_layers=1, lr=0.0005, batch_size=32, activation=relu
→ RMSE=0.7899, R²=0.7439
Training: hidden_size=64, num_layers=1, lr=0.0005, batch_size=32, activation=tanh
→ RMSE=1.9923, R²=-0.6289
Training: hidden_size=64, num_layers=1, lr=0.0005, batch_size=32, activation=sigmoid
→ RMSE=1.9916, R²=-0.6278
Training: hidden_size=64, num_layers=1, lr=0.0005, batch_size=32, activation=leaky_relu
→ RMSE=0.8040, R²=0.7347
Training: hidden_size=64, num_layers=1, lr=0.0005, batch_size=64, activation=relu
→ RMSE=0.8047, R²=0.7342
Training: hidden_size=64, num_layers=1, lr=0.0005, batch_size=64, activation=tanh
→ RMSE=1.9894, R²=-0.6242
Training: hidden_size=64, num_layers=1, lr=0.0005, batch_size=64, activation=sigmoid
→ RMSE=1.9923, R²=-0.6289
Training: hidden_size=64, num_layers=1, lr=0.0005, batch_size=64, activation=leaky_relu
→ RMSE=0.7900, R²=0.7439
Training: hidden_size=64, num_layers=2, lr=0.001, batch_size=32, activation=relu
→ RMSE=0.8248, R²=0.7208
Training: hidden_size=64, num_layers=2, lr=0.001, batch_size=32, activation=tanh
→ RMSE=1.9957, R²=-0.6346
Training: hidden_size=64, num_layers=2, lr=0.001, batch_size=32, activation=sigmoid
→ RMSE=1.9963, R²=-0.6356
Training: hidden_size=64, num_layers=2, lr=0.001, batch_size=32, activation=leaky_relu
→ RMSE=0.8276, R²=0.7189
Training: hidden_size=64, num_layers=2, lr=0.001, batch_size=64, activation=relu
→ RMSE=0.8266, R²=0.7196
Training: hidden_size=64, num_layers=2, lr=0.001, batch_size=64, activation=tanh
→ RMSE=1.9955, R²=-0.6342
Training: hidden_size=64, num_layers=2, lr=0.001, batch_size=64, activation=sigmoid
→ RMSE=1.9947, R²=-0.6328
Training: hidden_size=64, num_layers=2, lr=0.001, batch_size=64, activation=leaky_relu
→ RMSE=0.8224, R²=0.7224
Training: hidden_size=64, num_layers=2, lr=0.0005, batch_size=32, activation=relu
→ RMSE=0.7944, R²=0.7410
Training: hidden_size=64, num_layers=2, lr=0.0005, batch_size=32, activation=tanh
→ RMSE=1.9952, R²=-0.6337
Training: hidden_size=64, num_layers=2, lr=0.0005, batch_size=32, activation=sigmoid
→ RMSE=1.9954, R²=-0.6340
Training: hidden_size=64, num_layers=2, lr=0.0005, batch_size=32, activation=leaky_relu
→ RMSE=0.7887, R²=0.7447
Training: hidden_size=64, num_layers=2, lr=0.0005, batch_size=64, activation=relu
→ RMSE=0.8054, R²=0.7338
Training: hidden_size=64, num_layers=2, lr=0.0005, batch_size=64, activation=tanh
→ RMSE=1.9924, R²=-0.6292
Training: hidden_size=64, num_layers=2, lr=0.0005, batch_size=64, activation=sigmoid
→ RMSE=1.9935, R²=-0.6309
Training: hidden_size=64, num_layers=2, lr=0.0005, batch_size=64, activation=leaky_relu
→ RMSE=0.8073, R²=0.7325
Training: hidden_size=128, num_layers=1, lr=0.001, batch_size=32, activation=relu
→ RMSE=0.7934, R²=0.7416
Training: hidden_size=128, num_layers=1, lr=0.001, batch_size=32, activation=tanh
→ RMSE=1.9909, R²=-0.6267
Training: hidden_size=128, num_layers=1, lr=0.001, batch_size=32, activation=sigmoid
→ RMSE=1.9948, R²=-0.6331
Training: hidden_size=128, num_layers=1, lr=0.001, batch_size=32, activation=leaky_relu
→ RMSE=0.7816, R²=0.7493
Training: hidden_size=128, num_layers=1, lr=0.001, batch_size=64, activation=relu
→ RMSE=0.8122, R²=0.7293
Training: hidden_size=128, num_layers=1, lr=0.001, batch_size=64, activation=tanh
→ RMSE=1.9911, R²=-0.6270
Training: hidden_size=128, num_layers=1, lr=0.001, batch_size=64, activation=sigmoid
→ RMSE=1.9929, R²=-0.6299
Training: hidden_size=128, num_layers=1, lr=0.001, batch_size=64, activation=leaky_relu
→ RMSE=0.7982, R²=0.7385
Training: hidden_size=128, num_layers=1, lr=0.0005, batch_size=32, activation=relu
→ RMSE=0.7900, R²=0.7439
Training: hidden_size=128, num_layers=1, lr=0.0005, batch_size=32, activation=tanh
→ RMSE=1.9906, R²=-0.6262
Training: hidden_size=128, num_layers=1, lr=0.0005, batch_size=32, activation=sigmoid
→ RMSE=1.9925, R²=-0.6293
Training: hidden_size=128, num_layers=1, lr=0.0005, batch_size=32, activation=leaky_relu
→ RMSE=0.7849, R²=0.7472
Training: hidden_size=128, num_layers=1, lr=0.0005, batch_size=64, activation=relu
→ RMSE=0.7926, R²=0.7422
Training: hidden_size=128, num_layers=1, lr=0.0005, batch_size=64, activation=tanh
→ RMSE=1.9903, R²=-0.6258
Training: hidden_size=128, num_layers=1, lr=0.0005, batch_size=64, activation=sigmoid
→ RMSE=1.9938, R²=-0.6314
Training: hidden_size=128, num_layers=1, lr=0.0005, batch_size=64, activation=leaky_relu
→ RMSE=0.7795, R²=0.7507
Training: hidden_size=128, num_layers=2, lr=0.001, batch_size=32, activation=relu
→ RMSE=0.8174, R²=0.7258
Training: hidden_size=128, num_layers=2, lr=0.001, batch_size=32, activation=tanh
→ RMSE=1.9947, R²=-0.6329
Training: hidden_size=128, num_layers=2, lr=0.001, batch_size=32, activation=sigmoid
→ RMSE=1.9960, R²=-0.6350
Training: hidden_size=128, num_layers=2, lr=0.001, batch_size=32, activation=leaky_relu
→ RMSE=0.8343, R²=0.7144
Training: hidden_size=128, num_layers=2, lr=0.001, batch_size=64, activation=relu
→ RMSE=0.8070, R²=0.7327
Training: hidden_size=128, num_layers=2, lr=0.001, batch_size=64, activation=tanh
→ RMSE=1.9986, R²=-0.6393
Training: hidden_size=128, num_layers=2, lr=0.001, batch_size=64, activation=sigmoid
→ RMSE=1.9971, R²=-0.6368
Training: hidden_size=128, num_layers=2, lr=0.001, batch_size=64, activation=leaky_relu
→ RMSE=0.8015, R²=0.7364
Training: hidden_size=128, num_layers=2, lr=0.0005, batch_size=32, activation=relu
→ RMSE=0.8121, R²=0.7293
Training: hidden_size=128, num_layers=2, lr=0.0005, batch_size=32, activation=tanh
→ RMSE=1.9935, R²=-0.6310
Training: hidden_size=128, num_layers=2, lr=0.0005, batch_size=32, activation=sigmoid
→ RMSE=1.9947, R²=-0.6328
Training: hidden_size=128, num_layers=2, lr=0.0005, batch_size=32, activation=leaky_relu
→ RMSE=0.7870, R²=0.7458
Training: hidden_size=128, num_layers=2, lr=0.0005, batch_size=64, activation=relu
→ RMSE=0.8135, R²=0.7284
Training: hidden_size=128, num_layers=2, lr=0.0005, batch_size=64, activation=tanh
→ RMSE=1.9946, R²=-0.6327
Training: hidden_size=128, num_layers=2, lr=0.0005, batch_size=64, activation=sigmoid
→ RMSE=1.9959, R²=-0.6349
Training: hidden_size=128, num_layers=2, lr=0.0005, batch_size=64, activation=leaky_relu
→ RMSE=0.8092, R²=0.7313
=== LSTM Grid Search Results (sorted by RMSE) ===
hidden_size num_layers learning_rate batch_size activation RMSE R2
128 1 0.0005 64 leaky_relu 0.7795 0.7507
128 1 0.0010 32 leaky_relu 0.7816 0.7493
128 1 0.0005 32 leaky_relu 0.7849 0.7472
128 2 0.0005 32 leaky_relu 0.7870 0.7458
64 2 0.0005 32 leaky_relu 0.7887 0.7447
64 1 0.0005 32 relu 0.7899 0.7439
64 1 0.0005 64 leaky_relu 0.7900 0.7439
128 1 0.0005 32 relu 0.7900 0.7439
128 1 0.0005 64 relu 0.7926 0.7422
128 1 0.0010 32 relu 0.7934 0.7416
64 2 0.0005 32 relu 0.7944 0.7410
64 1 0.0010 64 relu 0.7974 0.7391
32 1 0.0005 64 relu 0.7976 0.7389
128 1 0.0010 64 leaky_relu 0.7982 0.7385
128 2 0.0010 64 leaky_relu 0.8015 0.7364
64 1 0.0010 32 relu 0.8023 0.7359
64 1 0.0010 32 leaky_relu 0.8029 0.7354
64 1 0.0005 32 leaky_relu 0.8040 0.7347
32 1 0.0010 64 relu 0.8045 0.7344
64 1 0.0005 64 relu 0.8047 0.7342
64 2 0.0005 64 relu 0.8054 0.7338
64 1 0.0010 64 leaky_relu 0.8063 0.7332
128 2 0.0010 64 relu 0.8070 0.7327
64 2 0.0005 64 leaky_relu 0.8073 0.7325
128 2 0.0005 64 leaky_relu 0.8092 0.7313
32 1 0.0010 32 relu 0.8099 0.7308
32 1 0.0005 32 relu 0.8106 0.7304
32 2 0.0010 64 leaky_relu 0.8107 0.7302
128 2 0.0005 32 relu 0.8121 0.7293
128 1 0.0010 64 relu 0.8122 0.7293
32 2 0.0005 32 leaky_relu 0.8125 0.7291
128 2 0.0005 64 relu 0.8135 0.7284
128 2 0.0010 32 relu 0.8174 0.7258
32 1 0.0005 32 leaky_relu 0.8186 0.7250
64 2 0.0010 64 leaky_relu 0.8224 0.7224
64 2 0.0010 32 relu 0.8248 0.7208
32 1 0.0010 64 leaky_relu 0.8254 0.7204
64 2 0.0010 64 relu 0.8266 0.7196
32 2 0.0005 64 leaky_relu 0.8276 0.7189
64 2 0.0010 32 leaky_relu 0.8276 0.7189
32 1 0.0010 32 leaky_relu 0.8322 0.7157
128 2 0.0010 32 leaky_relu 0.8343 0.7144
32 1 0.0005 64 leaky_relu 0.8350 0.7139
32 2 0.0010 32 leaky_relu 0.8722 0.6878
32 1 0.0010 32 tanh 1.9888 -0.6233
64 1 0.0010 64 tanh 1.9891 -0.6238
64 1 0.0005 64 tanh 1.9894 -0.6242
128 1 0.0005 64 tanh 1.9903 -0.6258
32 2 0.0005 64 tanh 1.9905 -0.6261
128 1 0.0005 32 tanh 1.9906 -0.6262
128 1 0.0010 32 tanh 1.9909 -0.6267
32 1 0.0005 32 tanh 1.9910 -0.6269
128 1 0.0010 64 tanh 1.9911 -0.6270
64 1 0.0005 32 sigmoid 1.9916 -0.6278
64 1 0.0010 32 tanh 1.9919 -0.6283
64 1 0.0010 32 sigmoid 1.9920 -0.6286
64 1 0.0005 32 tanh 1.9923 -0.6289
64 1 0.0005 64 sigmoid 1.9923 -0.6289
32 1 0.0010 64 tanh 1.9923 -0.6289
32 1 0.0005 64 tanh 1.9924 -0.6291
64 2 0.0005 64 tanh 1.9924 -0.6292
128 1 0.0005 32 sigmoid 1.9925 -0.6293
64 1 0.0010 64 sigmoid 1.9926 -0.6294
32 2 0.0010 32 sigmoid 1.9928 -0.6298
32 2 0.0010 64 tanh 1.9928 -0.6298
32 2 0.0005 64 sigmoid 1.9928 -0.6298
128 1 0.0010 64 sigmoid 1.9929 -0.6299
32 2 0.0005 32 sigmoid 1.9930 -0.6301
32 2 0.0010 64 sigmoid 1.9931 -0.6302
64 2 0.0005 64 sigmoid 1.9935 -0.6309
128 2 0.0005 32 tanh 1.9935 -0.6310
32 1 0.0010 64 sigmoid 1.9936 -0.6312
128 1 0.0005 64 sigmoid 1.9938 -0.6314
32 1 0.0010 32 sigmoid 1.9940 -0.6317
32 1 0.0005 32 sigmoid 1.9941 -0.6320
32 2 0.0010 32 tanh 1.9945 -0.6326
128 2 0.0005 64 tanh 1.9946 -0.6327
64 2 0.0010 64 sigmoid 1.9947 -0.6328
128 2 0.0005 32 sigmoid 1.9947 -0.6328
128 2 0.0010 32 tanh 1.9947 -0.6329
128 1 0.0010 32 sigmoid 1.9948 -0.6331
64 2 0.0005 32 tanh 1.9952 -0.6337
64 2 0.0005 32 sigmoid 1.9954 -0.6340
32 2 0.0005 32 tanh 1.9955 -0.6342
64 2 0.0010 64 tanh 1.9955 -0.6342
64 2 0.0010 32 tanh 1.9957 -0.6346
32 1 0.0005 64 sigmoid 1.9958 -0.6348
128 2 0.0005 64 sigmoid 1.9959 -0.6349
128 2 0.0010 32 sigmoid 1.9960 -0.6350
64 2 0.0010 32 sigmoid 1.9963 -0.6356
128 2 0.0010 64 sigmoid 1.9971 -0.6368
128 2 0.0010 64 tanh 1.9986 -0.6393
32 2 0.0005 32 relu 2.7765 -2.1637
32 2 0.0010 32 relu 2.7765 -2.1637
32 2 0.0010 64 relu 2.7765 -2.1637
32 2 0.0005 64 relu 2.7765 -2.1637
=== Best LSTM Hyperparameters ===
{'hidden_size': 128, 'num_layers': 1, 'learning_rate': 0.0005, 'batch_size': 64, 'activation': 'leaky_relu'}
Feature Selection: Feature Importance (Supervised): RandomForestRegressor¶
In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import warnings
import re
# Suppress specific RuntimeWarnings related to overflow in casting
warnings.filterwarnings(
"ignore",
message=re.escape("overflow encountered in cast"),
category=RuntimeWarning,
)
# Prepare target variable
y = rdkit_train['log_ic50']
# Define columns to drop if present
drop_cols = [
'chembl_id', 'smiles', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name'
]
# Drop unwanted columns safely
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
# Clean and convert feature data to numeric
X = X.replace(r'^\s*$', np.nan, regex=True) # Replace empty strings with NaN
X = X.apply(pd.to_numeric, errors='coerce') # Coerce non-numeric to NaN
X = X.replace([np.inf, -np.inf], np.nan) # Replace infinite values with NaN
X = X.clip(lower=-1e6, upper=1e6) # Clip extreme values
X = X.fillna(X.mean()) # Impute NaN with column means
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Initialize and train Random Forest Regressor with given best params
reg = RandomForestRegressor(**rf_best_params)
reg.fit(X_train, y_train)
# Extract feature importances
feature_importance = reg.feature_importances_
# Filter features with importance > 0
important_mask = feature_importance > 0.0
features_filtered = X.columns[important_mask]
importances_filtered = feature_importance[important_mask]
# Sort features by importance in descending order
sorted_idx = np.argsort(importances_filtered)[::-1]
features_sorted = features_filtered[sorted_idx]
importances_sorted = importances_filtered[sorted_idx]
# Plot feature importances
fig, ax = plt.subplots(figsize=(18, 8))
ax.bar(features_sorted, importances_sorted)
# Customize plot appearance
ax.set_xlabel("Features", fontsize=10)
ax.set_ylabel("Importance", fontsize=12)
ax.set_title("Feature Importance (Filtered)", fontsize=16)
plt.xticks(rotation=90, fontsize=8)
plt.yticks(fontsize=8)
ax.set_ylim(bottom=0)
ax.set_xlim(left=-0.5, right=len(features_sorted) - 0.5)
# Hide top and right plot borders
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
# Save the figure
plot_path = "/Users/janat/Desktop/ppimic50pred/feature_importance_filtered.png"
plt.savefig(plot_path, dpi=300)
print(f"Feature importance plot saved to: {plot_path}")
plt.show()
print(f"Number of filtered features: {features_filtered.shape[0]}")
Feature importance plot saved to: /Users/janat/Desktop/ppimic50pred/feature_importance_filtered.png
Number of filtered features: 195
Feature Importance & Feature Correlation combined¶
In [51]:
filtered_set = set(features_filtered.tolist())
full_set = set(corr_matrix.columns.tolist())
# Are all filtered features present in full feature list?
is_subset = filtered_set.issubset(full_set)
print(f"Filtered features are subset of correlation matrix features? {is_subset}")
# Check how many features differ
missing_features = full_set - filtered_set
extra_features = filtered_set - full_set
print(f"Features missing from filtered set: {missing_features}")
print(f"Features extra in filtered set (should be empty): {extra_features}")
# Confirm counts
print(f"Number of filtered features: {len(filtered_set)}")
print(f"Number of full features: {len(full_set)}")
Filtered features are subset of correlation matrix features? True
Features missing from filtered set: {'fr_term_acetylene', 'fr_phos_ester', 'fr_oxime', 'fr_dihydropyridine', 'fr_phos_acid', 'fr_hdrzine', 'fr_N_O'}
Features extra in filtered set (should be empty): set()
Number of filtered features: 195
Number of full features: 202
K fold Cross validation Model Development with blind set Validation¶
In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
# === Metrics ===
def medape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
mask = y_true > epsilon
if np.sum(mask) == 0:
return np.nan
return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
def smape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
denominator = np.where(denominator < epsilon, epsilon, denominator)
return np.mean(np.abs(y_true - y_pred) / denominator) * 100
# === LSTM model ===
class LSTMRegressor(nn.Module):
def __init__(self, input_size, hidden_size=64, num_layers=2, activation='relu'):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
activations = {
'relu': nn.ReLU(),
'tanh': nn.Tanh(),
'sigmoid': nn.Sigmoid(),
'leaky_relu': nn.LeakyReLU()
}
self.activation = activations.get(activation, None)
self.fc = nn.Linear(hidden_size, 1)
def forward(self, x):
out, _ = self.lstm(x)
out = out[:, -1, :] # take last time step
out = self.activation(out) if self.activation else out
return self.fc(out)
def train_model(model, train_loader, val_loader, epochs=200, patience=20, lr=0.001):
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
best_loss = float('inf')
best_state = None
patience_counter = 0
for epoch in range(epochs):
model.train()
for xb, yb in train_loader:
optimizer.zero_grad()
loss = criterion(model(xb), yb)
loss.backward()
optimizer.step()
model.eval()
val_losses = []
with torch.no_grad():
for xb, yb in val_loader:
val_loss = criterion(model(xb), yb)
val_losses.append(val_loss.item())
avg_val_loss = np.mean(val_losses)
if avg_val_loss < best_loss:
best_loss = avg_val_loss
best_state = model.state_dict()
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= patience:
break
model.load_state_dict(best_state)
return model
# Prepare training data
y = rdkit_train['log_ic50']
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
X = X.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce').fillna(X.mean())
X_selected = X[features_filtered]
print(f"Training features shape: {X_selected.shape}")
# Prepare blind test data
X_blind = rdkit_blind.drop(columns=[col for col in drop_cols if col in rdkit_blind.columns])
X_blind = X_blind.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce').fillna(X_blind.mean())
X_blind_selected = X_blind[features_filtered]
print(f"Blind features shape: {X_blind_selected.shape}")
# Define models
models = {
'Random Forest': RandomForestRegressor(**rf_best_params),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42),
'SVM': SVR(),
'LSTM (PyTorch)': None # Placeholder, instantiate later per fold
}
metrics = ['RMSE', 'MSE', 'R²', 'MedAPE', 'SMAPE']
# Cross-validation loop
for k in range(2, 11):
print(f"\n--- {k}-Fold Cross Validation ---")
kf = KFold(n_splits=k, shuffle=True, random_state=42)
scores = {metric: {m: [] for m in models} for metric in metrics}
for train_idx, test_idx in kf.split(X_selected):
# Use .iloc for row indexing
X_train, X_test = X_selected.iloc[train_idx], X_selected.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train classic models
for name, model in models.items():
if name == 'LSTM (PyTorch)':
continue
model.fit(X_train_scaled, y_train)
y_pred = np.maximum(model.predict(X_test_scaled), 0)
scores['RMSE'][name].append(np.sqrt(mean_squared_error(y_test, y_pred)))
scores['MSE'][name].append(mean_squared_error(y_test, y_pred))
scores['R²'][name].append(r2_score(y_test, y_pred))
scores['MedAPE'][name].append(medape(y_test, y_pred))
scores['SMAPE'][name].append(smape(np.expm1(y_test), np.expm1(y_pred)))
# Prepare data for LSTM
X_train_seq = X_train_scaled.reshape(-1, 1, X_train_scaled.shape[1])
X_test_seq = X_test_scaled.reshape(-1, 1, X_test_scaled.shape[1])
train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32)
# Split train into train/val sets
train_ds, val_ds = random_split(
TensorDataset(train_tensor, y_train_tensor),
[int(0.8 * len(train_tensor)), len(train_tensor) - int(0.8 * len(train_tensor))]
)
train_loader = DataLoader(train_ds, batch_size=lstm_best_params['batch_size'], shuffle=True)
val_loader = DataLoader(val_ds, batch_size=lstm_best_params['batch_size'])
# Instantiate and train LSTM
lstm = LSTMRegressor(
input_size=train_tensor.shape[2],
hidden_size=lstm_best_params['hidden_size'],
num_layers=lstm_best_params['num_layers'],
activation=lstm_best_params['activation']
)
lstm = train_model(lstm, train_loader, val_loader, lr=lstm_best_params['learning_rate'])
lstm.eval()
with torch.no_grad():
y_pred_lstm = torch.clamp(lstm(test_tensor), min=0).squeeze().numpy()
scores['RMSE']['LSTM (PyTorch)'].append(np.sqrt(mean_squared_error(y_test, y_pred_lstm)))
scores['MSE']['LSTM (PyTorch)'].append(mean_squared_error(y_test, y_pred_lstm))
scores['R²']['LSTM (PyTorch)'].append(r2_score(y_test, y_pred_lstm))
scores['MedAPE']['LSTM (PyTorch)'].append(medape(y_test, y_pred_lstm))
scores['SMAPE']['LSTM (PyTorch)'].append(smape(np.expm1(y_test), np.expm1(y_pred_lstm)))
# Average performance report
print("\nAverage Cross-Validation Performance:")
perf_df = pd.DataFrame({
'Model': list(models.keys()),
'Avg RMSE': [np.mean(scores['RMSE'][m]) for m in models],
'Avg MSE': [np.mean(scores['MSE'][m]) for m in models],
'Avg R²': [np.mean(scores['R²'][m]) for m in models],
'Avg MedAPE (%)': [np.mean(scores['MedAPE'][m]) for m in models],
'Avg SMAPE (%)': [np.mean(scores['SMAPE'][m]) for m in models],
})
print(perf_df.to_string(index=False, float_format="%.4f"))
# Plot results
fig, axes = plt.subplots(1, 5, figsize=(30, 6))
for i, metric in enumerate(perf_df.columns[1:]):
sns.barplot(data=perf_df, x='Model', y=metric, ax=axes[i], palette='Set2')
axes[i].set_title(f"{metric} ({k}-Fold)")
axes[i].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# Blind set evaluation
print(f"\n--- Evaluating Blind Set After {k}-Fold ---")
X_blind_scaled = scaler.transform(X_blind_selected)
blind_metrics = {metric: {} for metric in metrics}
for name, model in models.items():
if name == 'LSTM (PyTorch)':
X_blind_seq = X_blind_scaled.reshape(-1, 1, X_blind_scaled.shape[1])
X_blind_tensor = torch.tensor(X_blind_seq, dtype=torch.float32)
lstm.eval()
with torch.no_grad():
y_blind_pred = torch.clamp(lstm(X_blind_tensor), min=0).squeeze().numpy()
else:
y_blind_pred = np.maximum(model.predict(X_blind_scaled), 0)
blind_metrics['RMSE'][name] = np.sqrt(mean_squared_error(rdkit_blind['log_ic50'], y_blind_pred))
blind_metrics['MSE'][name] = mean_squared_error(rdkit_blind['log_ic50'], y_blind_pred)
blind_metrics['R²'][name] = r2_score(rdkit_blind['log_ic50'], y_blind_pred)
blind_metrics['MedAPE'][name] = medape(rdkit_blind['log_ic50'], y_blind_pred)
blind_metrics['SMAPE'][name] = smape(np.expm1(rdkit_blind['log_ic50']), np.expm1(y_blind_pred))
blind_df = pd.DataFrame({
'Model': list(models.keys()),
'RMSE': [blind_metrics['RMSE'][m] for m in models],
'MSE': [blind_metrics['MSE'][m] for m in models],
'R²': [blind_metrics['R²'][m] for m in models],
'MedAPE (%)': [blind_metrics['MedAPE'][m] for m in models],
'SMAPE (%)': [blind_metrics['SMAPE'][m] for m in models],
})
print(blind_df.to_string(index=False, float_format="%.4f"))
fig, axes = plt.subplots(1, 5, figsize=(30, 6))
for i, metric in enumerate(blind_df.columns[1:]):
sns.barplot(data=blind_df, x='Model', y=metric, ax=axes[i], palette='Set1')
axes[i].set_title(f"{metric} (Blind Set - {k}-Fold)")
axes[i].tick_params(axis='x', rotation=45)
axes[i].set_ylabel('')
axes[i].set_xlabel('')
plt.tight_layout()
plt.show()
Training features shape: (2760, 195)
Blind features shape: (691, 195)
--- 2-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.8290 0.6873 0.7177 18.6466 79.9779
Gradient Boosting 0.8420 0.7089 0.7088 19.0893 81.8591
SVM 0.8628 0.7445 0.6940 18.6059 81.7704
LSTM (PyTorch) 0.8583 0.7367 0.6975 18.1648 81.4304
--- Evaluating Blind Set After 2-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8233 0.6778 0.7161 19.0168 81.7635
Gradient Boosting 0.8464 0.7163 0.6999 20.0592 83.4906
SVM 0.8465 0.7166 0.6998 18.7351 82.8707
LSTM (PyTorch) 0.8325 0.6930 0.7097 19.2133 84.2816
--- 3-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.8021 0.6436 0.7357 17.6069 78.7549
Gradient Boosting 0.8110 0.6578 0.7299 18.6044 80.8307
SVM 0.8272 0.6844 0.7190 18.2495 80.3841
LSTM (PyTorch) 0.8341 0.6962 0.7142 17.6261 79.2042
--- Evaluating Blind Set After 3-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8146 0.6636 0.7220 18.2866 80.7609
Gradient Boosting 0.8260 0.6823 0.7142 19.5453 80.9937
SVM 0.8337 0.6950 0.7089 18.4223 82.0819
LSTM (PyTorch) 0.8282 0.6860 0.7127 19.3857 82.6379
--- 4-Fold Cross Validation ---
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 8.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 10.9s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 19.9s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.4s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.5s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.2s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.4s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.0s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.0s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 19.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.4s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 8.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.9s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.0s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.0s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 19.9s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.4s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 16.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.5s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.2s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.7s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.0s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.0s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.4s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.4s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 19.9s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.4s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.5s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 19.9s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.4s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 16.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.4s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.3s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.9s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.8s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 19.9s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.7s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 7.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.3s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.6s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 24.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.5s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 8.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 24.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.2s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.2s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.4s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 8.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 18.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 20.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 8.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.7s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.2s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.5s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 11.4s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.3s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.4s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.7s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.1s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.4s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 21.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.2s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.0s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.5s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 14.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 7.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.7s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.2s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.4s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.0s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 11.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.5s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 20.1s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 19.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 7.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.3s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 7.0s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.8s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 14.0s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 18.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.4s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 5.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 17.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 11.3s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.6s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 11.2s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 16.8s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 16.7s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5s
[CV] END max_depth=15, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.4s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 0.9s
[CV] END max_depth=15, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 14.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 6.8s
[CV] END max_depth=15, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 13.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 6.7s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 13.4s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 6.6s
[CV] END max_depth=15, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 13.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.3s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.8s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.5s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.0s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.3s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.7s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 7.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 14.1s
[CV] END max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 21.5s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.2s
[CV] END max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 20.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 0.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 1.1s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 0.6s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 0.3s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 0.7s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 0.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 15.9s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time= 23.8s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 22.1s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 21.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 14.4s
[CV] END max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 19.0s
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7916 0.6268 0.7426 17.1389 77.3645
Gradient Boosting 0.8078 0.6527 0.7317 18.3958 80.1055
SVM 0.8199 0.6722 0.7238 17.6031 79.7897
LSTM (PyTorch) 0.8190 0.6709 0.7242 17.2603 78.8675
--- Evaluating Blind Set After 4-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8040 0.6464 0.7292 17.5506 79.6499
Gradient Boosting 0.8224 0.6763 0.7167 17.9610 81.5546
SVM 0.8325 0.6931 0.7097 18.1064 81.2555
LSTM (PyTorch) 0.8112 0.6580 0.7244 17.8390 80.7993
--- 5-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7870 0.6195 0.7453 17.0895 77.0642
Gradient Boosting 0.8045 0.6474 0.7337 18.4781 80.1997
SVM 0.8144 0.6636 0.7271 17.4957 79.4142
LSTM (PyTorch) 0.8119 0.6599 0.7287 17.6522 79.0241
--- Evaluating Blind Set After 5-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8125 0.6601 0.7235 16.9374 79.5787
Gradient Boosting 0.8204 0.6730 0.7181 18.8345 81.3931
SVM 0.8316 0.6916 0.7103 17.3724 80.8095
LSTM (PyTorch) 0.8209 0.6739 0.7177 17.5415 81.5131
--- 6-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7830 0.6131 0.7476 16.9576 76.6907
Gradient Boosting 0.8059 0.6498 0.7320 18.3794 79.9894
SVM 0.8104 0.6569 0.7293 17.4967 79.2315
LSTM (PyTorch) 0.8079 0.6537 0.7311 17.5491 77.8070
--- Evaluating Blind Set After 6-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.7972 0.6356 0.7337 16.4811 78.6011
Gradient Boosting 0.8251 0.6808 0.7148 18.2088 81.5747
SVM 0.8286 0.6866 0.7124 16.7300 80.6265
LSTM (PyTorch) 0.7969 0.6350 0.7340 17.6807 79.7223
--- 7-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7831 0.6136 0.7476 16.9895 76.5393
Gradient Boosting 0.8060 0.6500 0.7323 18.3578 79.8213
SVM 0.8100 0.6563 0.7298 17.1677 79.0237
LSTM (PyTorch) 0.8233 0.6786 0.7210 17.6855 78.9346
--- Evaluating Blind Set After 7-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.7961 0.6338 0.7345 16.5858 78.6456
Gradient Boosting 0.8221 0.6759 0.7169 19.0763 81.2620
SVM 0.8275 0.6847 0.7132 16.7307 80.1446
LSTM (PyTorch) 0.8210 0.6740 0.7177 18.3176 79.7620
--- 8-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7812 0.6111 0.7488 16.4537 75.9685
Gradient Boosting 0.8079 0.6541 0.7305 18.5037 79.8927
SVM 0.8121 0.6602 0.7284 17.0989 78.8937
LSTM (PyTorch) 0.8069 0.6518 0.7319 17.1280 77.3170
--- Evaluating Blind Set After 8-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.7907 0.6252 0.7381 15.9084 77.7957
Gradient Boosting 0.8052 0.6483 0.7284 17.3009 80.4722
SVM 0.8268 0.6836 0.7137 16.6339 79.8332
LSTM (PyTorch) 0.7971 0.6354 0.7338 18.6228 80.0632
--- 9-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7783 0.6066 0.7504 16.9517 76.5251
Gradient Boosting 0.8049 0.6485 0.7325 18.2784 79.8958
SVM 0.8090 0.6551 0.7305 17.2808 78.9408
LSTM (PyTorch) 0.8077 0.6534 0.7314 17.2245 78.0919
--- Evaluating Blind Set After 9-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.7915 0.6264 0.7376 16.3696 78.3048
Gradient Boosting 0.8181 0.6693 0.7196 18.1840 80.8315
SVM 0.8221 0.6758 0.7169 16.8559 79.8022
LSTM (PyTorch) 0.7949 0.6319 0.7353 17.3417 80.1636
--- 10-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7813 0.6112 0.7484 16.5218 75.9953
Gradient Boosting 0.8085 0.6547 0.7303 18.0636 79.9365
SVM 0.8086 0.6551 0.7303 17.3441 78.7169
LSTM (PyTorch) 0.8072 0.6527 0.7315 16.8854 77.3670
--- Evaluating Blind Set After 10-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.7908 0.6253 0.7381 15.6751 78.2311
Gradient Boosting 0.8255 0.6815 0.7145 18.4551 80.8686
SVM 0.8220 0.6756 0.7170 16.8226 79.8207
LSTM (PyTorch) 0.7960 0.6336 0.7346 17.8041 79.7241
Overfitting/Underfitting explananation in Kfold All models in RDkit¶
In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
# === Metrics ===
def medape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
mask = y_true > epsilon
if np.sum(mask) == 0:
return np.nan
return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
def smape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
denominator = np.where(denominator < epsilon, epsilon, denominator)
return np.mean(np.abs(y_true - y_pred) / denominator) * 100
# === LSTM model ===
class LSTMRegressor(nn.Module):
def __init__(self, input_size, hidden_size=64, num_layers=2, activation='relu'):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
activations = {
'relu': nn.ReLU(),
'tanh': nn.Tanh(),
'sigmoid': nn.Sigmoid(),
'leaky_relu': nn.LeakyReLU()
}
self.activation = activations.get(activation, None)
self.fc = nn.Linear(hidden_size, 1)
def forward(self, x):
out, _ = self.lstm(x)
out = out[:, -1, :]
out = self.activation(out) if self.activation else out
return self.fc(out)
def train_model(model, train_loader, val_loader, epochs=200, patience=20, lr=0.001):
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
best_loss = float('inf')
best_state = None
patience_counter = 0
for epoch in range(epochs):
model.train()
for xb, yb in train_loader:
optimizer.zero_grad()
loss = criterion(model(xb), yb)
loss.backward()
optimizer.step()
model.eval()
val_losses = []
with torch.no_grad():
for xb, yb in val_loader:
val_loss = criterion(model(xb), yb)
val_losses.append(val_loss.item())
avg_val_loss = np.mean(val_losses)
if avg_val_loss < best_loss:
best_loss = avg_val_loss
best_state = model.state_dict()
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= patience:
break
model.load_state_dict(best_state)
return model
# === Prepare Data ===
y = rdkit_train['log_ic50']
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
X = X.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce').fillna(X.mean())
X_selected = X[features_filtered]
X_blind = rdkit_blind.drop(columns=[col for col in drop_cols if col in rdkit_blind.columns])
X_blind = X_blind.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce').fillna(X_blind.mean())
X_blind_selected = X_blind[features_filtered]
# === Models ===
models = {
'Random Forest': RandomForestRegressor(**rf_best_params),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42),
'SVM': SVR(),
'LSTM (PyTorch)': None
}
metrics = ['RMSE', 'MSE', 'R²', 'MedAPE', 'SMAPE']
# === Cross-validation with Overfitting Detection ===
kf = KFold(n_splits=9, shuffle=True, random_state=42)
scores = {metric: {m: [] for m in models} for metric in metrics}
train_rmse_hist, val_rmse_hist = {m: [] for m in models}, {m: [] for m in models}
for train_idx, test_idx in kf.split(X_selected):
X_train, X_test = X_selected.iloc[train_idx], X_selected.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
for name, model in models.items():
if name == 'LSTM (PyTorch)':
continue
model.fit(X_train_scaled, y_train)
y_train_pred = np.maximum(model.predict(X_train_scaled), 0)
y_val_pred = np.maximum(model.predict(X_test_scaled), 0)
train_rmse_hist[name].append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
val_rmse_hist[name].append(np.sqrt(mean_squared_error(y_test, y_val_pred)))
scores['RMSE'][name].append(train_rmse_hist[name][-1])
scores['MSE'][name].append(mean_squared_error(y_test, y_val_pred))
scores['R²'][name].append(r2_score(y_test, y_val_pred))
scores['MedAPE'][name].append(medape(y_test, y_val_pred))
scores['SMAPE'][name].append(smape(np.expm1(y_test), np.expm1(y_val_pred)))
# LSTM processing...
X_train_seq = X_train_scaled.reshape(-1, 1, X_train_scaled.shape[1])
X_test_seq = X_test_scaled.reshape(-1, 1, X_test_scaled.shape[1])
train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32)
train_ds, val_ds = random_split(
TensorDataset(train_tensor, y_train_tensor),
[int(0.8 * len(train_tensor)), len(train_tensor) - int(0.8 * len(train_tensor))]
)
train_loader = DataLoader(train_ds, batch_size=lstm_best_params['batch_size'], shuffle=True)
val_loader = DataLoader(val_ds, batch_size=lstm_best_params['batch_size'])
lstm = LSTMRegressor(
input_size=train_tensor.shape[2],
hidden_size=lstm_best_params['hidden_size'],
num_layers=lstm_best_params['num_layers'],
activation=lstm_best_params['activation']
)
lstm = train_model(lstm, train_loader, val_loader, lr=lstm_best_params['learning_rate'])
lstm.eval()
with torch.no_grad():
y_train_pred_lstm = torch.clamp(lstm(train_tensor), min=0).squeeze().numpy()
y_val_pred_lstm = torch.clamp(lstm(test_tensor), min=0).squeeze().numpy()
train_rmse_hist['LSTM (PyTorch)'].append(np.sqrt(mean_squared_error(y_train, y_train_pred_lstm)))
val_rmse_hist['LSTM (PyTorch)'].append(np.sqrt(mean_squared_error(y_test, y_val_pred_lstm)))
scores['RMSE']['LSTM (PyTorch)'].append(train_rmse_hist['LSTM (PyTorch)'][-1])
scores['MSE']['LSTM (PyTorch)'].append(mean_squared_error(y_test, y_val_pred_lstm))
scores['R²']['LSTM (PyTorch)'].append(r2_score(y_test, y_val_pred_lstm))
scores['MedAPE']['LSTM (PyTorch)'].append(medape(y_test, y_val_pred_lstm))
scores['SMAPE']['LSTM (PyTorch)'].append(smape(np.expm1(y_test), np.expm1(y_val_pred_lstm)))
# === Overfitting / Underfitting Plot ===
plt.figure(figsize=(10, 6))
for model in models:
plt.plot(train_rmse_hist[model], label=f"{model} - Train", linestyle='--')
plt.plot(val_rmse_hist[model], label=f"{model} - Val")
plt.xlabel("Fold")
plt.ylabel("RMSE")
plt.title("Overfitting / Underfitting Detection")
# Place legend outside plot, top right
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)
plt.tight_layout()
plt.show()
# === Best Model Plot ===
avg_val_rmse = {m: np.mean(val_rmse_hist[m]) for m in models}
best_model = min(avg_val_rmse, key=avg_val_rmse.get)
plt.figure(figsize=(8, 5))
sns.barplot(x=list(avg_val_rmse.keys()), y=list(avg_val_rmse.values()), palette='Set2')
plt.axhline(avg_val_rmse[best_model], color='red', linestyle='--', label=f"Best: {best_model}")
plt.ylabel("Avg Validation RMSE")
plt.title("Best Model Comparison")
plt.legend()
plt.show()
Best Random Forest Model in K fold cross validation with Overfitting/Underfitting¶
In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
# === Create output directory ===
output_dir = "/Users/janat/Desktop/ppimic50pred/model"
os.makedirs(output_dir, exist_ok=True)
# === Custom Metrics ===
def medape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
mask = y_true > epsilon
if np.sum(mask) == 0:
return np.nan
return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
def smape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
denominator = np.where(denominator < epsilon, epsilon, denominator)
return np.mean(np.abs(y_true - y_pred) / denominator) * 100
# === Prepare Data ===
drop_cols = ['chembl_id', 'smiles', 'ic50_value', 'log_ic50',
'target_chembl_id', 'target_name']
y = rdkit_train['log_ic50']
# Clean training features
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
X = X.replace(r'^\s*$', np.nan, regex=True)
X = X.apply(pd.to_numeric, errors='coerce')
X = X.fillna(X.mean())
# Select fixed important features only
X_selected = X[features_filtered]
# Prepare blind test set similarly
X_blind = rdkit_blind.drop(columns=[col for col in drop_cols if col in rdkit_blind.columns])
X_blind = X_blind.replace(r'^\s*$', np.nan, regex=True)
X_blind = X_blind.apply(pd.to_numeric, errors='coerce')
X_blind = X_blind.fillna(X_blind.mean())
X_blind_selected = X_blind[features_filtered]
# === Model ===
rf = RandomForestRegressor(**rf_best_params)
# === Cross-validation ===
kf = KFold(n_splits=9, shuffle=True, random_state=42)
train_r2_hist, val_r2_hist = [], []
scores = {"RMSE": [], "MSE": [], "R²": [], "MedAPE": [], "SMAPE": []}
# Store last fold data for plotting true vs predicted
y_train_last, y_train_pred_last = None, None
y_val_last, y_val_pred_last = None, None
for train_idx, val_idx in kf.split(X_selected):
X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
rf.fit(X_train_scaled, y_train)
y_train_pred = np.maximum(rf.predict(X_train_scaled), 0)
y_val_pred = np.maximum(rf.predict(X_val_scaled), 0)
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
train_r2_hist.append(train_r2)
val_r2_hist.append(val_r2)
scores["RMSE"].append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
scores["MSE"].append(mean_squared_error(y_val, y_val_pred))
scores["R²"].append(val_r2)
scores["MedAPE"].append(medape(y_val, y_val_pred))
scores["SMAPE"].append(smape(np.expm1(y_val), np.expm1(y_val_pred)))
# Save last fold results for plotting
y_train_last, y_train_pred_last = y_train, y_train_pred
y_val_last, y_val_pred_last = y_val, y_val_pred
# === Average CV Performance ===
perf_df = pd.DataFrame({
"Metric": list(scores.keys()),
"Mean": [np.mean(scores[m]) for m in scores],
"Std": [np.std(scores[m]) for m in scores]
})
print("\nAverage CV Performance (Random Forest):")
print(perf_df.to_string(index=False, float_format="%.4f"))
perf_df.to_csv(os.path.join(output_dir, "cv_performance.csv"), index=False)
# === Retrain Final Model on Full Data ===
scaler_final = StandardScaler()
X_scaled_full = scaler_final.fit_transform(X_selected)
rf.fit(X_scaled_full, y)
joblib.dump({"model": rf, "scaler": scaler_final},
os.path.join(output_dir, "random_forest_model.pkl"))
# === Overfitting / Underfitting Plot (R²) ===
plt.figure(figsize=(8, 5))
plt.plot(train_r2_hist, label="Train R²", linestyle='--', marker='o')
plt.plot(val_r2_hist, label="Validation R²", marker='o')
plt.xlabel("Fold")
plt.ylabel("R² Score")
plt.title("Random Forest - Overfitting/Underfitting Check")
plt.legend(loc='upper left', bbox_to_anchor=(1.02, 1), borderaxespad=0)
plt.grid(True)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "overfitting_underfitting_plot.png"))
plt.show()
# === True vs Predicted Plots (Last Fold) ===
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.scatter(y_train_last, y_train_pred_last, alpha=0.5)
plt.plot([min(y_train_last), max(y_train_last)],
[min(y_train_last), max(y_train_last)], 'r--')
plt.title("Train: True vs Predicted")
plt.xlabel("True log_ic50")
plt.ylabel("Predicted log_ic50")
plt.subplot(1, 2, 2)
plt.scatter(y_val_last, y_val_pred_last, alpha=0.5)
plt.plot([min(y_val_last), max(y_val_last)],
[min(y_val_last), max(y_val_last)], 'r--')
plt.title("Validation: True vs Predicted")
plt.xlabel("True log_ic50")
plt.ylabel("Predicted log_ic50")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "true_vs_predicted.png"))
plt.show()
Average CV Performance (Random Forest):
Metric Mean Std
RMSE 0.7777 0.0257
MSE 0.6054 0.0403
R² 0.7508 0.0156
MedAPE 16.5070 1.1767
SMAPE 76.1879 3.3553
Random Forest Model using RDkit features if just using train_test_split : 80% training and 20% testing¶
In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Prepare target variable
y = rdkit_train['log_ic50']
# Define columns to drop if present
drop_cols = [
'chembl_id', 'smiles', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name'
]
# Drop unwanted columns safely
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
# Clean features as in your pipeline (example: convert to numeric, handle NaNs)
X = X.replace(r'^\s*$', np.nan, regex=True)
X = X.apply(pd.to_numeric, errors='coerce')
X = X.replace([np.inf, -np.inf], np.nan)
X = X.clip(lower=-1e6, upper=1e6)
X = X.fillna(X.mean())
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Initialize Random Forest Regressor (use your best params)
reg = RandomForestRegressor(**rf_best_params)
reg.fit(X_train, y_train)
# Function to check overfitting or underfitting
def check_overfitting_underfitting(model, X_train, y_train, X_val, y_val):
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_val = r2_score(y_val, y_val_pred)
print(f"Training MSE: {mse_train:.4f}")
print(f"Validation MSE: {mse_val:.4f}")
print(f"Training R²: {r2_train:.4f}")
print(f"Validation R²: {r2_val:.4f}")
if r2_train > 0.8 and r2_val < 0.5:
print("\nWarning: Possible Overfitting detected (high train score, low val score).")
elif r2_train < 0.5 and r2_val < 0.5:
print("\nWarning: Possible Underfitting detected (both train and val scores are low).")
else:
print("\nModel seems reasonably fit.")
# Plot true vs predicted
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.scatter(y_train, y_train_pred, alpha=0.5)
plt.plot([min(y_train), max(y_train)], [min(y_train), max(y_train)], 'r--')
plt.title("Train: True vs Predicted")
plt.xlabel("True log_ic50")
plt.ylabel("Predicted log_ic50")
plt.subplot(1, 2, 2)
plt.scatter(y_val, y_val_pred, alpha=0.5)
plt.plot([min(y_val), max(y_val)], [min(y_val), max(y_val)], 'r--')
plt.title("Validation: True vs Predicted")
plt.xlabel("True log_ic50")
plt.ylabel("Predicted log_ic50")
plt.tight_layout()
plt.show()
# Run the check
check_overfitting_underfitting(reg, X_train, y_train, X_val, y_val)
Training MSE: 0.1239 Validation MSE: 0.5674 Training R²: 0.9491 Validation R²: 0.7671 Model seems reasonably fit.
In [63]:
lstm_best_params
Out[63]:
{'hidden_size': 128,
'num_layers': 1,
'learning_rate': 0.0005,
'batch_size': 64,
'activation': 'leaky_relu'}
In [64]:
rf_best_params
Out[64]:
{'max_depth': None,
'max_features': None,
'min_samples_leaf': 2,
'min_samples_split': 2,
'n_estimators': 200}
Feature selection: Variance Threshold¶
Variance Threshold feature selection and Blind set Validation in Kfold cross validation, with Tunned parameters¶
VarianceThreshold(0.01)¶
In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
# === Metrics ===
def medape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
mask = y_true > epsilon
return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100 if np.sum(mask) else np.nan
def smape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
denominator = np.where(denominator < epsilon, epsilon, denominator)
return np.mean(np.abs(y_true - y_pred) / denominator) * 100
# === LSTM ===
class LSTMRegressor(nn.Module):
def __init__(self, input_size, hidden_size=64, num_layers=2, activation='relu'):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.activation = {'relu': nn.ReLU(), 'tanh': nn.Tanh(), 'sigmoid': nn.Sigmoid(), 'leaky_relu': nn.LeakyReLU()}.get(activation)
self.fc = nn.Linear(hidden_size, 1)
def forward(self, x):
out, _ = self.lstm(x)
out = out[:, -1, :]
return self.fc(self.activation(out)) if self.activation else self.fc(out)
def train_model(model, train_loader, val_loader, epochs=200, patience=20, lr=0.001):
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
best_loss, best_state, patience_counter = float('inf'), None, 0
for epoch in range(epochs):
model.train()
for xb, yb in train_loader:
optimizer.zero_grad()
loss = criterion(model(xb), yb)
loss.backward()
optimizer.step()
model.eval()
val_loss = np.mean([criterion(model(xb), yb).item() for xb, yb in val_loader])
if val_loss < best_loss:
best_loss, best_state = val_loss, model.state_dict()
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= patience:
break
model.load_state_dict(best_state)
return model
# === Load your data ===
# rdkit_train = pd.read_csv('rdkit_train.csv')
# rdkit_blind = pd.read_csv('rdkit_blind.csv')
# === Parameters ===
drop_cols = ['chembl_id', 'smiles', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name']
#rf_best_params = {'n_estimators': 200, 'random_state': 42}
#lstm_best_params = {'hidden_size': 64, 'num_layers': 2, 'activation': 'relu', 'learning_rate': 0.001, 'batch_size': 32}
metrics = ['RMSE', 'MSE', 'R²', 'MedAPE', 'SMAPE']
# === Prepare Training Set ===
y = rdkit_train['log_ic50']
X = rdkit_train.drop(columns=[col for col in drop_cols if col in rdkit_train.columns])
X = X.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce').fillna(X.mean())
selector = VarianceThreshold(0.01)
X_selected = selector.fit_transform(X)
# === Prepare Blind Set ===
X_blind = rdkit_blind.drop(columns=[col for col in drop_cols if col in rdkit_blind.columns])
X_blind = X_blind.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce').fillna(X_blind.mean())
X_blind_selected = selector.transform(X_blind)
# === Models ===
models = {
'Random Forest': RandomForestRegressor(**rf_best_params),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42),
'SVM': SVR(),
'LSTM (PyTorch)': None # will instantiate per fold
}
# === Cross-validation and Blind Evaluation ===
for k in range(2, 11):
print(f"\n--- {k}-Fold Cross Validation ---")
kf = KFold(n_splits=k, shuffle=True, random_state=42)
scores = {metric: {m: [] for m in models} for metric in metrics}
for train_idx, test_idx in kf.split(X_selected):
X_train, X_test = X_selected[train_idx], X_selected[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Classic models
for name, model in models.items():
if name == 'LSTM (PyTorch)':
continue
model.fit(X_train_scaled, y_train)
y_pred = np.maximum(model.predict(X_test_scaled), 0)
scores['RMSE'][name].append(np.sqrt(mean_squared_error(y_test, y_pred)))
scores['MSE'][name].append(mean_squared_error(y_test, y_pred))
scores['R²'][name].append(r2_score(y_test, y_pred))
scores['MedAPE'][name].append(medape(y_test, y_pred))
scores['SMAPE'][name].append(smape(np.expm1(y_test), np.expm1(y_pred)))
# LSTM
X_train_seq = X_train_scaled.reshape(-1, 1, X_train_scaled.shape[1])
X_test_seq = X_test_scaled.reshape(-1, 1, X_test_scaled.shape[1])
train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32)
train_ds, val_ds = random_split(TensorDataset(train_tensor, y_train_tensor), [int(0.8*len(train_tensor)), len(train_tensor) - int(0.8*len(train_tensor))])
train_loader = DataLoader(train_ds, batch_size=lstm_best_params['batch_size'], shuffle=True)
val_loader = DataLoader(val_ds, batch_size=lstm_best_params['batch_size'])
lstm = LSTMRegressor(
input_size=train_tensor.shape[2],
hidden_size=lstm_best_params['hidden_size'],
num_layers=lstm_best_params['num_layers'],
activation=lstm_best_params['activation']
)
lstm = train_model(lstm, train_loader, val_loader, lr=lstm_best_params['learning_rate'])
lstm.eval()
with torch.no_grad():
y_pred_lstm = torch.clamp(lstm(test_tensor), min=0).squeeze().numpy()
scores['RMSE']['LSTM (PyTorch)'].append(np.sqrt(mean_squared_error(y_test, y_pred_lstm)))
scores['MSE']['LSTM (PyTorch)'].append(mean_squared_error(y_test, y_pred_lstm))
scores['R²']['LSTM (PyTorch)'].append(r2_score(y_test, y_pred_lstm))
scores['MedAPE']['LSTM (PyTorch)'].append(medape(y_test, y_pred_lstm))
scores['SMAPE']['LSTM (PyTorch)'].append(smape(np.expm1(y_test), np.expm1(y_pred_lstm)))
# === Fold-wise Average Report ===
print("\nAverage Cross-Validation Performance:")
perf_df = pd.DataFrame({
'Model': list(models.keys()),
'Avg RMSE': [np.mean(scores['RMSE'][m]) for m in models],
'Avg MSE': [np.mean(scores['MSE'][m]) for m in models],
'Avg R²': [np.mean(scores['R²'][m]) for m in models],
'Avg MedAPE (%)': [np.mean(scores['MedAPE'][m]) for m in models],
'Avg SMAPE (%)': [np.mean(scores['SMAPE'][m]) for m in models]
})
print(perf_df.to_string(index=False, float_format="%.4f"))
# === Plot ===
fig, axes = plt.subplots(1, 5, figsize=(30, 6))
for i, metric in enumerate(perf_df.columns[1:]):
sns.barplot(data=perf_df, x='Model', y=metric, ax=axes[i], palette='Set2')
axes[i].set_title(f"{metric} ({k}-Fold)")
axes[i].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# === Blind Set Evaluation ===
print(f"\n--- Evaluating Blind Set After {k}-Fold ---")
X_blind_scaled = scaler.transform(X_blind_selected)
blind_metrics = {metric: {} for metric in metrics}
for name, model in models.items():
if name == 'LSTM (PyTorch)':
X_blind_seq = X_blind_scaled.reshape(-1, 1, X_blind_scaled.shape[1])
X_blind_tensor = torch.tensor(X_blind_seq, dtype=torch.float32)
lstm.eval()
with torch.no_grad():
y_blind_pred = torch.clamp(lstm(X_blind_tensor), min=0).squeeze().numpy()
else:
y_blind_pred = np.maximum(model.predict(X_blind_scaled), 0)
blind_metrics['RMSE'][name] = np.sqrt(mean_squared_error(rdkit_blind['log_ic50'], y_blind_pred))
blind_metrics['MSE'][name] = mean_squared_error(rdkit_blind['log_ic50'], y_blind_pred)
blind_metrics['R²'][name] = r2_score(rdkit_blind['log_ic50'], y_blind_pred)
blind_metrics['MedAPE'][name] = medape(rdkit_blind['log_ic50'], y_blind_pred)
blind_metrics['SMAPE'][name] = smape(np.expm1(rdkit_blind['log_ic50']), np.expm1(y_blind_pred))
blind_df = pd.DataFrame({
'Model': list(models.keys()),
'RMSE': [blind_metrics['RMSE'][m] for m in models],
'MSE': [blind_metrics['MSE'][m] for m in models],
'R²': [blind_metrics['R²'][m] for m in models],
'MedAPE (%)': [blind_metrics['MedAPE'][m] for m in models],
'SMAPE (%)': [blind_metrics['SMAPE'][m] for m in models],
})
print(blind_df.to_string(index=False, float_format="%.4f"))
# === Blind Set Performance Plot ===
fig, axes = plt.subplots(1, 5, figsize=(30, 6))
for i, metric in enumerate(blind_df.columns[1:]): # Skip 'Model'
sns.barplot(data=blind_df, x='Model', y=metric, ax=axes[i], palette='Set1')
axes[i].set_title(f"{metric} (Blind Set - {k}-Fold)")
axes[i].tick_params(axis='x', rotation=45)
axes[i].set_ylabel('')
axes[i].set_xlabel('')
plt.tight_layout()
plt.show()
--- 2-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.8306 0.6899 0.7167 18.3376 80.0410
Gradient Boosting 0.8394 0.7046 0.7104 18.5265 81.5235
SVM 0.8597 0.7391 0.6962 18.3370 81.5356
LSTM (PyTorch) 0.8386 0.7033 0.7112 17.9457 79.4662
--- Evaluating Blind Set After 2-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8318 0.6920 0.7101 19.2571 81.9812
Gradient Boosting 0.8418 0.7086 0.7031 19.7676 82.9457
SVM 0.8469 0.7172 0.6995 18.2966 82.7228
LSTM (PyTorch) 0.8397 0.7051 0.7046 19.4839 81.9340
--- 3-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.8041 0.6469 0.7343 17.7428 78.9234
Gradient Boosting 0.8129 0.6609 0.7286 18.6595 80.8828
SVM 0.8277 0.6851 0.7187 18.3377 80.3264
LSTM (PyTorch) 0.8292 0.6878 0.7176 17.7991 79.9591
--- Evaluating Blind Set After 3-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8193 0.6712 0.7188 18.3502 80.9031
Gradient Boosting 0.8284 0.6862 0.7126 19.3496 81.6753
SVM 0.8346 0.6965 0.7082 17.2792 81.9315
LSTM (PyTorch) 0.8317 0.6917 0.7103 19.6313 82.3786
--- 4-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7928 0.6287 0.7418 17.5687 77.6203
Gradient Boosting 0.8068 0.6511 0.7324 18.8292 80.4229
SVM 0.8181 0.6694 0.7250 17.5108 79.4088
LSTM (PyTorch) 0.7967 0.6350 0.7393 17.0654 78.4268
--- Evaluating Blind Set After 4-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8133 0.6615 0.7229 17.3580 79.7938
Gradient Boosting 0.8194 0.6714 0.7188 18.9598 81.5839
SVM 0.8332 0.6943 0.7092 17.3507 81.1078
LSTM (PyTorch) 0.8046 0.6473 0.7288 16.3188 79.2607
--- 5-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7903 0.6248 0.7431 17.0714 77.1634
Gradient Boosting 0.8087 0.6542 0.7309 18.1469 80.0850
SVM 0.8126 0.6607 0.7283 17.4019 79.1291
LSTM (PyTorch) 0.8191 0.6712 0.7239 17.6785 78.9956
--- Evaluating Blind Set After 5-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8132 0.6612 0.7230 17.0881 79.0780
Gradient Boosting 0.8185 0.6699 0.7194 18.7750 81.9491
SVM 0.8298 0.6886 0.7115 16.9815 80.3660
LSTM (PyTorch) 0.7998 0.6397 0.7320 17.0147 78.8508
--- 6-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7808 0.6098 0.7490 16.5719 76.5058
Gradient Boosting 0.8066 0.6508 0.7314 18.6352 79.8760
SVM 0.8087 0.6541 0.7304 17.1598 78.9577
LSTM (PyTorch) 0.8172 0.6682 0.7249 17.4682 78.9627
--- Evaluating Blind Set After 6-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.7991 0.6385 0.7325 16.5196 79.0845
Gradient Boosting 0.8157 0.6653 0.7213 17.8290 81.4344
SVM 0.8269 0.6837 0.7136 16.2511 80.0064
LSTM (PyTorch) 0.7854 0.6168 0.7416 17.6698 80.4095
--- 7-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7849 0.6166 0.7464 16.8352 76.3880
Gradient Boosting 0.8107 0.6578 0.7290 18.1164 79.5318
SVM 0.8080 0.6531 0.7310 17.2001 78.6876
LSTM (PyTorch) 0.8138 0.6624 0.7273 17.7269 78.5441
--- Evaluating Blind Set After 7-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.7982 0.6371 0.7331 16.3899 78.7106
Gradient Boosting 0.8190 0.6708 0.7190 17.2375 79.9327
SVM 0.8232 0.6776 0.7161 15.9622 79.5238
LSTM (PyTorch) 0.7763 0.6027 0.7475 16.4765 79.9479
--- 8-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7837 0.6152 0.7472 16.7356 76.1451
Gradient Boosting 0.8052 0.6494 0.7328 18.7734 80.4595
SVM 0.8099 0.6564 0.7299 17.1373 78.6925
LSTM (PyTorch) 0.7919 0.6284 0.7418 17.0691 77.8212
--- Evaluating Blind Set After 8-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8027 0.6444 0.7301 16.4908 78.5860
Gradient Boosting 0.8109 0.6575 0.7246 18.1899 80.6279
SVM 0.8230 0.6774 0.7163 15.9812 79.0867
LSTM (PyTorch) 0.7882 0.6213 0.7397 18.5012 79.3905
--- 9-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7820 0.6124 0.7480 16.6276 76.7069
Gradient Boosting 0.8096 0.6561 0.7294 18.1616 80.1187
SVM 0.8075 0.6526 0.7315 17.2445 78.7770
LSTM (PyTorch) 0.8036 0.6464 0.7338 17.1023 77.8354
--- Evaluating Blind Set After 9-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8008 0.6412 0.7314 16.4448 79.2043
Gradient Boosting 0.8091 0.6546 0.7258 17.4756 79.8383
SVM 0.8194 0.6714 0.7187 16.3089 79.2809
LSTM (PyTorch) 0.7809 0.6098 0.7446 18.5073 80.1949
--- 10-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7838 0.6152 0.7467 16.7042 76.2175
Gradient Boosting 0.8108 0.6585 0.7285 18.3073 80.2463
SVM 0.8075 0.6532 0.7310 17.2731 78.4776
LSTM (PyTorch) 0.8056 0.6503 0.7325 17.5499 78.1882
--- Evaluating Blind Set After 10-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.7983 0.6373 0.7330 16.9109 78.9585
Gradient Boosting 0.8207 0.6736 0.7178 18.0580 80.7082
SVM 0.8197 0.6719 0.7185 16.2833 79.3225
LSTM (PyTorch) 0.7939 0.6303 0.7360 18.6228 81.2217
PubChem¶
In [64]:
print("🧾 PubChem Training Columns:")
print(pubchem_train.columns.tolist())
# Count total columns
total_columns_pubchem = len(pubchem_train.columns)
print(f"\n🔢 Total columns: {total_columns_pubchem}")
🧾 PubChem Training Columns: ['CID', 'MW', 'XLogP3', 'HBDC', 'HDAC', 'RBC', 'ExactMass', 'MonoMass', 'TPSA', 'HAC', 'Charge', 'Complexity', 'IsotopeAtoms', 'DASC', 'UASC', 'DBSC', 'UBSC', 'CBU', 'chembl_id', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name'] 🔢 Total columns: 23
In [65]:
# RDKit
rdkit_train, rdkit_blind = split_full_dataset(merged_df_rdkit, descriptor_name="RDKit")
# PubChem
pubchem_train, pubchem_blind = split_full_dataset(merged_df_pubchem, descriptor_name="PubChem")
# PaDEL
padel_train, padel_blind = split_full_dataset(merged_df_padel, descriptor_name="PaDEL")
📊 RDKit Split (All Columns Kept): Training Set: (2760, 223) Blind Set: (691, 223) 📊 PubChem Split (All Columns Kept): Training Set: (2755, 23) Blind Set: (689, 23) 📊 PaDEL Split (All Columns Kept): Training Set: (2340, 1881) Blind Set: (586, 1881)
In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
# === Metrics ===
def medape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
mask = y_true > epsilon
return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100 if np.sum(mask) else np.nan
def smape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
denominator = np.where(denominator < epsilon, epsilon, denominator)
return np.mean(np.abs(y_true - y_pred) / denominator) * 100
# === LSTM ===
class LSTMRegressor(nn.Module):
def __init__(self, input_size, hidden_size=64, num_layers=2, activation='relu'):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.activation = {'relu': nn.ReLU(), 'tanh': nn.Tanh(), 'sigmoid': nn.Sigmoid(), 'leaky_relu': nn.LeakyReLU()}.get(activation)
self.fc = nn.Linear(hidden_size, 1)
def forward(self, x):
out, _ = self.lstm(x)
out = out[:, -1, :]
return self.fc(self.activation(out)) if self.activation else self.fc(out)
def train_model(model, train_loader, val_loader, epochs=200, patience=20, lr=0.001):
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
best_loss, best_state, patience_counter = float('inf'), None, 0
for epoch in range(epochs):
model.train()
for xb, yb in train_loader:
optimizer.zero_grad()
loss = criterion(model(xb), yb)
loss.backward()
optimizer.step()
model.eval()
val_loss = np.mean([criterion(model(xb), yb).item() for xb, yb in val_loader])
if val_loss < best_loss:
best_loss, best_state = val_loss, model.state_dict()
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= patience:
break
model.load_state_dict(best_state)
return model
# === Prepare Training Set ===
y = pubchem_train['log_ic50']
drop_cols = ['target_chembl_id', 'target_name', 'ic50_value', 'chembl_id', 'log_ic50', 'CID']
X = pubchem_train.drop(columns=[col for col in drop_cols if col in pubchem_train.columns])
y = pubchem_train['log_ic50']
X = pubchem_train.drop(columns=[col for col in drop_cols if col in pubchem_train.columns])
X = X.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce').fillna(X.mean())
selector = VarianceThreshold(0.01)
X_selected = selector.fit_transform(X)
# === Prepare Blind Set ===
X_blind = pubchem_blind.drop(columns=[col for col in drop_cols if col in pubchem_blind.columns])
X_blind = X_blind.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce').fillna(X_blind.mean())
X_blind_selected = selector.transform(X_blind)
# === Models ===
models = {
'Random Forest': RandomForestRegressor(**rf_best_params),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42),
'SVM': SVR(),
'LSTM (PyTorch)': None # will instantiate per fold
}
# === Cross-validation and Blind Evaluation ===
for k in range(2, 11):
print(f"\n--- {k}-Fold Cross Validation ---")
kf = KFold(n_splits=k, shuffle=True, random_state=42)
scores = {metric: {m: [] for m in models} for metric in metrics}
for train_idx, test_idx in kf.split(X_selected):
X_train, X_test = X_selected[train_idx], X_selected[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Classic models
for name, model in models.items():
if name == 'LSTM (PyTorch)':
continue
model.fit(X_train_scaled, y_train)
y_pred = np.maximum(model.predict(X_test_scaled), 0)
scores['RMSE'][name].append(np.sqrt(mean_squared_error(y_test, y_pred)))
scores['MSE'][name].append(mean_squared_error(y_test, y_pred))
scores['R²'][name].append(r2_score(y_test, y_pred))
scores['MedAPE'][name].append(medape(y_test, y_pred))
scores['SMAPE'][name].append(smape(np.expm1(y_test), np.expm1(y_pred)))
# LSTM
X_train_seq = X_train_scaled.reshape(-1, 1, X_train_scaled.shape[1])
X_test_seq = X_test_scaled.reshape(-1, 1, X_test_scaled.shape[1])
train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32)
train_ds, val_ds = random_split(TensorDataset(train_tensor, y_train_tensor), [int(0.8*len(train_tensor)), len(train_tensor) - int(0.8*len(train_tensor))])
train_loader = DataLoader(train_ds, batch_size=lstm_best_params['batch_size'], shuffle=True)
val_loader = DataLoader(val_ds, batch_size=lstm_best_params['batch_size'])
lstm = LSTMRegressor(
input_size=train_tensor.shape[2],
hidden_size=lstm_best_params['hidden_size'],
num_layers=lstm_best_params['num_layers'],
activation=lstm_best_params['activation']
)
lstm = train_model(lstm, train_loader, val_loader, lr=lstm_best_params['learning_rate'])
lstm.eval()
with torch.no_grad():
y_pred_lstm = torch.clamp(lstm(test_tensor), min=0).squeeze().numpy()
scores['RMSE']['LSTM (PyTorch)'].append(np.sqrt(mean_squared_error(y_test, y_pred_lstm)))
scores['MSE']['LSTM (PyTorch)'].append(mean_squared_error(y_test, y_pred_lstm))
scores['R²']['LSTM (PyTorch)'].append(r2_score(y_test, y_pred_lstm))
scores['MedAPE']['LSTM (PyTorch)'].append(medape(y_test, y_pred_lstm))
scores['SMAPE']['LSTM (PyTorch)'].append(smape(np.expm1(y_test), np.expm1(y_pred_lstm)))
# === Fold-wise Average Report ===
print("\nAverage Cross-Validation Performance:")
perf_df = pd.DataFrame({
'Model': list(models.keys()),
'Avg RMSE': [np.mean(scores['RMSE'][m]) for m in models],
'Avg MSE': [np.mean(scores['MSE'][m]) for m in models],
'Avg R²': [np.mean(scores['R²'][m]) for m in models],
'Avg MedAPE (%)': [np.mean(scores['MedAPE'][m]) for m in models],
'Avg SMAPE (%)': [np.mean(scores['SMAPE'][m]) for m in models]
})
print(perf_df.to_string(index=False, float_format="%.4f"))
# === Plot ===
fig, axes = plt.subplots(1, 5, figsize=(30, 6))
for i, metric in enumerate(perf_df.columns[1:]):
sns.barplot(data=perf_df, x='Model', y=metric, ax=axes[i], palette='Set2')
axes[i].set_title(f"{metric} ({k}-Fold)")
axes[i].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# === Blind Set Evaluation ===
print(f"\n--- Evaluating Blind Set After {k}-Fold ---")
X_blind_scaled = scaler.transform(X_blind_selected)
blind_metrics = {metric: {} for metric in metrics}
for name, model in models.items():
if name == 'LSTM (PyTorch)':
X_blind_seq = X_blind_scaled.reshape(-1, 1, X_blind_scaled.shape[1])
X_blind_tensor = torch.tensor(X_blind_seq, dtype=torch.float32)
lstm.eval()
with torch.no_grad():
y_blind_pred = torch.clamp(lstm(X_blind_tensor), min=0).squeeze().numpy()
else:
y_blind_pred = np.maximum(model.predict(X_blind_scaled), 0)
blind_metrics['RMSE'][name] = np.sqrt(mean_squared_error(pubchem_blind['log_ic50'], y_blind_pred))
blind_metrics['MSE'][name] = mean_squared_error(pubchem_blind['log_ic50'], y_blind_pred)
blind_metrics['R²'][name] = r2_score(pubchem_blind['log_ic50'], y_blind_pred)
blind_metrics['MedAPE'][name] = medape(pubchem_blind['log_ic50'], y_blind_pred)
blind_metrics['SMAPE'][name] = smape(np.expm1(pubchem_blind['log_ic50']), np.expm1(y_blind_pred))
blind_df = pd.DataFrame({
'Model': list(models.keys()),
'RMSE': [blind_metrics['RMSE'][m] for m in models],
'MSE': [blind_metrics['MSE'][m] for m in models],
'R²': [blind_metrics['R²'][m] for m in models],
'MedAPE (%)': [blind_metrics['MedAPE'][m] for m in models],
'SMAPE (%)': [blind_metrics['SMAPE'][m] for m in models],
})
print(blind_df.to_string(index=False, float_format="%.4f"))
# === Blind Set Performance Plot ===
fig, axes = plt.subplots(1, 5, figsize=(30, 6))
for i, metric in enumerate(blind_df.columns[1:]): # Skip 'Model'
sns.barplot(data=blind_df, x='Model', y=metric, ax=axes[i], palette='Set1')
axes[i].set_title(f"{metric} (Blind Set - {k}-Fold)")
axes[i].tick_params(axis='x', rotation=45)
axes[i].set_ylabel('')
axes[i].set_xlabel('')
plt.tight_layout()
plt.show()
--- 2-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9650 0.9316 0.6207 22.1764 88.0600
Gradient Boosting 1.0593 1.1227 0.5428 24.2731 91.7890
SVM 1.1779 1.3877 0.4350 26.4215 94.9156
LSTM (PyTorch) 1.0976 1.2048 0.5095 25.6699 93.8234
--- Evaluating Blind Set After 2-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.9238 0.8535 0.6310 21.7753 83.1534
Gradient Boosting 1.0137 1.0277 0.5557 23.2788 88.7054
SVM 1.1483 1.3186 0.4299 25.7887 90.6058
LSTM (PyTorch) 1.0529 1.1085 0.5207 23.8776 89.1002
--- 3-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9562 0.9155 0.6272 20.8467 86.6917
Gradient Boosting 1.0519 1.1076 0.5491 23.7445 92.2441
SVM 1.1552 1.3377 0.4564 26.0198 93.9440
LSTM (PyTorch) 1.0609 1.1276 0.5415 24.4133 91.9217
--- Evaluating Blind Set After 3-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.9029 0.8153 0.6475 20.6366 81.7045
Gradient Boosting 0.9741 0.9488 0.5897 21.6789 86.7245
SVM 1.1325 1.2825 0.4455 24.9998 91.0851
LSTM (PyTorch) 1.0201 1.0406 0.5501 22.8982 87.2315
--- 4-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9296 0.8648 0.6472 20.4790 85.3181
Gradient Boosting 1.0468 1.0970 0.5525 24.5865 92.2225
SVM 1.1515 1.3284 0.4580 25.8194 93.7500
LSTM (PyTorch) 1.0442 1.0912 0.5551 24.5830 92.0172
--- Evaluating Blind Set After 4-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8921 0.7958 0.6559 19.4110 80.9251
Gradient Boosting 0.9767 0.9539 0.5876 23.0535 86.3471
SVM 1.1338 1.2856 0.4441 25.2926 90.8776
LSTM (PyTorch) 1.0018 1.0036 0.5661 22.4751 87.4091
--- 5-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9310 0.8686 0.6461 20.5735 84.8339
Gradient Boosting 1.0452 1.0955 0.5536 24.5220 92.2092
SVM 1.1493 1.3237 0.4612 25.7936 93.4915
LSTM (PyTorch) 1.0455 1.0963 0.5538 23.5374 91.0706
--- Evaluating Blind Set After 5-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8812 0.7766 0.6642 19.1285 80.3259
Gradient Boosting 0.9869 0.9739 0.5789 22.7855 87.0353
SVM 1.1397 1.2988 0.4384 25.2972 90.6792
LSTM (PyTorch) 1.0114 1.0230 0.5577 22.6097 87.1144
--- 6-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9220 0.8515 0.6520 19.9976 84.7819
Gradient Boosting 1.0515 1.1075 0.5479 24.2694 92.5525
SVM 1.1456 1.3161 0.4635 26.0027 93.4025
LSTM (PyTorch) 1.0398 1.0840 0.5583 23.2962 91.1565
--- Evaluating Blind Set After 6-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8837 0.7809 0.6624 19.4085 80.4159
Gradient Boosting 0.9939 0.9879 0.5728 23.4599 87.9159
SVM 1.1314 1.2800 0.4465 24.9547 90.2268
LSTM (PyTorch) 0.9968 0.9935 0.5704 23.2269 87.3949
--- 7-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9171 0.8426 0.6565 20.6343 84.3038
Gradient Boosting 1.0408 1.0844 0.5578 23.9926 92.2280
SVM 1.1393 1.3002 0.4702 25.2941 93.0221
LSTM (PyTorch) 1.0361 1.0741 0.5621 23.6958 91.3822
--- Evaluating Blind Set After 7-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8793 0.7731 0.6657 19.3000 79.4940
Gradient Boosting 0.9874 0.9749 0.5785 22.7668 86.8556
SVM 1.1320 1.2813 0.4460 24.7950 90.2792
LSTM (PyTorch) 1.0080 1.0161 0.5606 23.0166 88.0377
--- 8-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9207 0.8495 0.6510 20.0383 84.5432
Gradient Boosting 1.0436 1.0909 0.5518 24.4609 92.3968
SVM 1.1430 1.3090 0.4634 25.4500 93.2093
LSTM (PyTorch) 1.0418 1.0879 0.5538 23.9747 91.2961
--- Evaluating Blind Set After 8-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8884 0.7893 0.6587 18.9798 80.4385
Gradient Boosting 1.0014 1.0029 0.5664 22.4221 87.5089
SVM 1.1303 1.2776 0.4476 24.6660 90.1613
LSTM (PyTorch) 0.9986 0.9972 0.5688 21.6298 85.7673
--- 9-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9141 0.8387 0.6582 20.3681 84.1366
Gradient Boosting 1.0378 1.0811 0.5595 24.3002 91.9980
SVM 1.1395 1.3035 0.4700 25.7233 93.1983
LSTM (PyTorch) 1.0321 1.0707 0.5645 23.4127 90.6850
--- Evaluating Blind Set After 9-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8823 0.7785 0.6634 19.6452 80.5561
Gradient Boosting 0.9898 0.9797 0.5764 21.9740 86.8337
SVM 1.1311 1.2794 0.4468 24.0113 90.0334
LSTM (PyTorch) 1.0074 1.0149 0.5612 23.6077 88.2959
--- 10-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.9133 0.8383 0.6567 20.2510 84.1582
Gradient Boosting 1.0358 1.0783 0.5587 24.3003 92.2076
SVM 1.1393 1.3037 0.4668 25.4167 93.0835
LSTM (PyTorch) 1.0370 1.0822 0.5576 23.7977 91.1289
--- Evaluating Blind Set After 10-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8833 0.7803 0.6626 18.4689 80.4102
Gradient Boosting 0.9923 0.9846 0.5743 23.1634 87.3988
SVM 1.1289 1.2744 0.4490 24.5147 90.0608
LSTM (PyTorch) 1.0125 1.0251 0.5568 23.2636 88.1259
In [ ]:
PaDEL¶
In [68]:
#padel_train, padel_blind = split_full_dataset(merged_df_padel, descriptor_name="PaDEL")
In [69]:
print("🧾 RDKIT Training Columns:")
print(padel_train.columns.tolist())
# Count total columns
total_columns_padel = len(padel_train.columns)
print(f"\n🔢 Total columns: {total_columns_padel}")
🧾 RDKIT Training Columns: ['Name', 'nAcid', 'ALogP', 'ALogp2', 'AMR', 'apol', 'naAromAtom', 'nAromBond', 'nAtom', 'nHeavyAtom', 'nH', 'nB', 'nC', 'nN', 'nO', 'nS', 'nP', 'nF', 'nCl', 'nBr', 'nI', 'nX', 'ATS0m', 'ATS1m', 'ATS2m', 'ATS3m', 'ATS4m', 'ATS5m', 'ATS6m', 'ATS7m', 'ATS8m', 'ATS0v', 'ATS1v', 'ATS2v', 'ATS3v', 'ATS4v', 'ATS5v', 'ATS6v', 'ATS7v', 'ATS8v', 'ATS0e', 'ATS1e', 'ATS2e', 'ATS3e', 'ATS4e', 'ATS5e', 'ATS6e', 'ATS7e', 'ATS8e', 'ATS0p', 'ATS1p', 'ATS2p', 'ATS3p', 'ATS4p', 'ATS5p', 'ATS6p', 'ATS7p', 'ATS8p', 'ATS0i', 'ATS1i', 'ATS2i', 'ATS3i', 'ATS4i', 'ATS5i', 'ATS6i', 'ATS7i', 'ATS8i', 'ATS0s', 'ATS1s', 'ATS2s', 'ATS3s', 'ATS4s', 'ATS5s', 'ATS6s', 'ATS7s', 'ATS8s', 'AATS0m', 'AATS1m', 'AATS2m', 'AATS3m', 'AATS4m', 'AATS5m', 'AATS6m', 'AATS7m', 'AATS8m', 'AATS0v', 'AATS1v', 'AATS2v', 'AATS3v', 'AATS4v', 'AATS5v', 'AATS6v', 'AATS7v', 'AATS8v', 'AATS0e', 'AATS1e', 'AATS2e', 'AATS3e', 'AATS4e', 'AATS5e', 'AATS6e', 'AATS7e', 'AATS8e', 'AATS0p', 'AATS1p', 'AATS2p', 'AATS3p', 'AATS4p', 'AATS5p', 'AATS6p', 'AATS7p', 'AATS8p', 'AATS0i', 'AATS1i', 'AATS2i', 'AATS3i', 'AATS4i', 'AATS5i', 'AATS6i', 'AATS7i', 'AATS8i', 'AATS0s', 'AATS1s', 'AATS2s', 'AATS3s', 'AATS4s', 'AATS5s', 'AATS6s', 'AATS7s', 'AATS8s', 'ATSC0c', 'ATSC1c', 'ATSC2c', 'ATSC3c', 'ATSC4c', 'ATSC5c', 'ATSC6c', 'ATSC7c', 'ATSC8c', 'ATSC0m', 'ATSC1m', 'ATSC2m', 'ATSC3m', 'ATSC4m', 'ATSC5m', 'ATSC6m', 'ATSC7m', 'ATSC8m', 'ATSC0v', 'ATSC1v', 'ATSC2v', 'ATSC3v', 'ATSC4v', 'ATSC5v', 'ATSC6v', 'ATSC7v', 'ATSC8v', 'ATSC0e', 'ATSC1e', 'ATSC2e', 'ATSC3e', 'ATSC4e', 'ATSC5e', 'ATSC6e', 'ATSC7e', 'ATSC8e', 'ATSC0p', 'ATSC1p', 'ATSC2p', 'ATSC3p', 'ATSC4p', 'ATSC5p', 'ATSC6p', 'ATSC7p', 'ATSC8p', 'ATSC0i', 'ATSC1i', 'ATSC2i', 'ATSC3i', 'ATSC4i', 'ATSC5i', 'ATSC6i', 'ATSC7i', 'ATSC8i', 'ATSC0s', 'ATSC1s', 'ATSC2s', 'ATSC3s', 'ATSC4s', 'ATSC5s', 'ATSC6s', 'ATSC7s', 'ATSC8s', 'AATSC0c', 'AATSC1c', 'AATSC2c', 'AATSC3c', 'AATSC4c', 'AATSC5c', 'AATSC6c', 'AATSC7c', 'AATSC8c', 'AATSC0m', 'AATSC1m', 'AATSC2m', 'AATSC3m', 'AATSC4m', 'AATSC5m', 'AATSC6m', 'AATSC7m', 'AATSC8m', 'AATSC0v', 'AATSC1v', 'AATSC2v', 'AATSC3v', 'AATSC4v', 'AATSC5v', 'AATSC6v', 'AATSC7v', 'AATSC8v', 'AATSC0e', 'AATSC1e', 'AATSC2e', 'AATSC3e', 'AATSC4e', 'AATSC5e', 'AATSC6e', 'AATSC7e', 'AATSC8e', 'AATSC0p', 'AATSC1p', 'AATSC2p', 'AATSC3p', 'AATSC4p', 'AATSC5p', 'AATSC6p', 'AATSC7p', 'AATSC8p', 'AATSC0i', 'AATSC1i', 'AATSC2i', 'AATSC3i', 'AATSC4i', 'AATSC5i', 'AATSC6i', 'AATSC7i', 'AATSC8i', 'AATSC0s', 'AATSC1s', 'AATSC2s', 'AATSC3s', 'AATSC4s', 'AATSC5s', 'AATSC6s', 'AATSC7s', 'AATSC8s', 'MATS1c', 'MATS2c', 'MATS3c', 'MATS4c', 'MATS5c', 'MATS6c', 'MATS7c', 'MATS8c', 'MATS1m', 'MATS2m', 'MATS3m', 'MATS4m', 'MATS5m', 'MATS6m', 'MATS7m', 'MATS8m', 'MATS1v', 'MATS2v', 'MATS3v', 'MATS4v', 'MATS5v', 'MATS6v', 'MATS7v', 'MATS8v', 'MATS1e', 'MATS2e', 'MATS3e', 'MATS4e', 'MATS5e', 'MATS6e', 'MATS7e', 'MATS8e', 'MATS1p', 'MATS2p', 'MATS3p', 'MATS4p', 'MATS5p', 'MATS6p', 'MATS7p', 'MATS8p', 'MATS1i', 'MATS2i', 'MATS3i', 'MATS4i', 'MATS5i', 'MATS6i', 'MATS7i', 'MATS8i', 'MATS1s', 'MATS2s', 'MATS3s', 'MATS4s', 'MATS5s', 'MATS6s', 'MATS7s', 'MATS8s', 'GATS1c', 'GATS2c', 'GATS3c', 'GATS4c', 'GATS5c', 'GATS6c', 'GATS7c', 'GATS8c', 'GATS1m', 'GATS2m', 'GATS3m', 'GATS4m', 'GATS5m', 'GATS6m', 'GATS7m', 'GATS8m', 'GATS1v', 'GATS2v', 'GATS3v', 'GATS4v', 'GATS5v', 'GATS6v', 'GATS7v', 'GATS8v', 'GATS1e', 'GATS2e', 'GATS3e', 'GATS4e', 'GATS5e', 'GATS6e', 'GATS7e', 'GATS8e', 'GATS1p', 'GATS2p', 'GATS3p', 'GATS4p', 'GATS5p', 'GATS6p', 'GATS7p', 'GATS8p', 'GATS1i', 'GATS2i', 'GATS3i', 'GATS4i', 'GATS5i', 'GATS6i', 'GATS7i', 'GATS8i', 'GATS1s', 'GATS2s', 'GATS3s', 'GATS4s', 'GATS5s', 'GATS6s', 'GATS7s', 'GATS8s', 'SpAbs_DzZ', 'SpMax_DzZ', 'SpDiam_DzZ', 'SpAD_DzZ', 'SpMAD_DzZ', 'EE_DzZ', 'SM1_DzZ', 'VE1_DzZ', 'VE2_DzZ', 'VE3_DzZ', 'VR1_DzZ', 'VR2_DzZ', 'VR3_DzZ', 'SpAbs_Dzm', 'SpMax_Dzm', 'SpDiam_Dzm', 'SpAD_Dzm', 'SpMAD_Dzm', 'EE_Dzm', 'SM1_Dzm', 'VE1_Dzm', 'VE2_Dzm', 'VE3_Dzm', 'VR1_Dzm', 'VR2_Dzm', 'VR3_Dzm', 'SpAbs_Dzv', 'SpMax_Dzv', 'SpDiam_Dzv', 'SpAD_Dzv', 'SpMAD_Dzv', 'EE_Dzv', 'SM1_Dzv', 'VE1_Dzv', 'VE2_Dzv', 'VE3_Dzv', 'VR1_Dzv', 'VR2_Dzv', 'VR3_Dzv', 'SpAbs_Dze', 'SpMax_Dze', 'SpDiam_Dze', 'SpAD_Dze', 'SpMAD_Dze', 'EE_Dze', 'SM1_Dze', 'VE1_Dze', 'VE2_Dze', 'VE3_Dze', 'VR1_Dze', 'VR2_Dze', 'VR3_Dze', 'SpAbs_Dzp', 'SpMax_Dzp', 'SpDiam_Dzp', 'SpAD_Dzp', 'SpMAD_Dzp', 'EE_Dzp', 'SM1_Dzp', 'VE1_Dzp', 'VE2_Dzp', 'VE3_Dzp', 'VR1_Dzp', 'VR2_Dzp', 'VR3_Dzp', 'SpAbs_Dzi', 'SpMax_Dzi', 'SpDiam_Dzi', 'SpAD_Dzi', 'SpMAD_Dzi', 'EE_Dzi', 'SM1_Dzi', 'VE1_Dzi', 'VE2_Dzi', 'VE3_Dzi', 'VR1_Dzi', 'VR2_Dzi', 'VR3_Dzi', 'SpAbs_Dzs', 'SpMax_Dzs', 'SpDiam_Dzs', 'SpAD_Dzs', 'SpMAD_Dzs', 'EE_Dzs', 'SM1_Dzs', 'VE1_Dzs', 'VE2_Dzs', 'VE3_Dzs', 'VR1_Dzs', 'VR2_Dzs', 'VR3_Dzs', 'nBase', 'BCUTw-1l', 'BCUTw-1h', 'BCUTc-1l', 'BCUTc-1h', 'BCUTp-1l', 'BCUTp-1h', 'nBonds', 'nBonds2', 'nBondsS', 'nBondsS2', 'nBondsS3', 'nBondsD', 'nBondsD2', 'nBondsT', 'nBondsQ', 'nBondsM', 'bpol', 'SpMax1_Bhm', 'SpMax2_Bhm', 'SpMax3_Bhm', 'SpMax4_Bhm', 'SpMax5_Bhm', 'SpMax6_Bhm', 'SpMax7_Bhm', 'SpMax8_Bhm', 'SpMin1_Bhm', 'SpMin2_Bhm', 'SpMin3_Bhm', 'SpMin4_Bhm', 'SpMin5_Bhm', 'SpMin6_Bhm', 'SpMin7_Bhm', 'SpMin8_Bhm', 'SpMax1_Bhv', 'SpMax2_Bhv', 'SpMax3_Bhv', 'SpMax4_Bhv', 'SpMax5_Bhv', 'SpMax6_Bhv', 'SpMax7_Bhv', 'SpMax8_Bhv', 'SpMin1_Bhv', 'SpMin2_Bhv', 'SpMin3_Bhv', 'SpMin4_Bhv', 'SpMin5_Bhv', 'SpMin6_Bhv', 'SpMin7_Bhv', 'SpMin8_Bhv', 'SpMax1_Bhe', 'SpMax2_Bhe', 'SpMax3_Bhe', 'SpMax4_Bhe', 'SpMax5_Bhe', 'SpMax6_Bhe', 'SpMax7_Bhe', 'SpMax8_Bhe', 'SpMin1_Bhe', 'SpMin2_Bhe', 'SpMin3_Bhe', 'SpMin4_Bhe', 'SpMin5_Bhe', 'SpMin6_Bhe', 'SpMin7_Bhe', 'SpMin8_Bhe', 'SpMax1_Bhp', 'SpMax2_Bhp', 'SpMax3_Bhp', 'SpMax4_Bhp', 'SpMax5_Bhp', 'SpMax6_Bhp', 'SpMax7_Bhp', 'SpMax8_Bhp', 'SpMin1_Bhp', 'SpMin2_Bhp', 'SpMin3_Bhp', 'SpMin4_Bhp', 'SpMin5_Bhp', 'SpMin6_Bhp', 'SpMin7_Bhp', 'SpMin8_Bhp', 'SpMax1_Bhi', 'SpMax2_Bhi', 'SpMax3_Bhi', 'SpMax4_Bhi', 'SpMax5_Bhi', 'SpMax6_Bhi', 'SpMax7_Bhi', 'SpMax8_Bhi', 'SpMin1_Bhi', 'SpMin2_Bhi', 'SpMin3_Bhi', 'SpMin4_Bhi', 'SpMin5_Bhi', 'SpMin6_Bhi', 'SpMin7_Bhi', 'SpMin8_Bhi', 'SpMax1_Bhs', 'SpMax2_Bhs', 'SpMax3_Bhs', 'SpMax4_Bhs', 'SpMax5_Bhs', 'SpMax6_Bhs', 'SpMax7_Bhs', 'SpMax8_Bhs', 'SpMin1_Bhs', 'SpMin2_Bhs', 'SpMin3_Bhs', 'SpMin4_Bhs', 'SpMin5_Bhs', 'SpMin6_Bhs', 'SpMin7_Bhs', 'SpMin8_Bhs', 'C1SP1', 'C2SP1', 'C1SP2', 'C2SP2', 'C3SP2', 'C1SP3', 'C2SP3', 'C3SP3', 'C4SP3', 'SCH-3', 'SCH-4', 'SCH-5', 'SCH-6', 'SCH-7', 'VCH-3', 'VCH-4', 'VCH-5', 'VCH-6', 'VCH-7', 'SC-3', 'SC-4', 'SC-5', 'SC-6', 'VC-3', 'VC-4', 'VC-5', 'VC-6', 'SPC-4', 'SPC-5', 'SPC-6', 'VPC-4', 'VPC-5', 'VPC-6', 'SP-0', 'SP-1', 'SP-2', 'SP-3', 'SP-4', 'SP-5', 'SP-6', 'SP-7', 'ASP-0', 'ASP-1', 'ASP-2', 'ASP-3', 'ASP-4', 'ASP-5', 'ASP-6', 'ASP-7', 'VP-0', 'VP-1', 'VP-2', 'VP-3', 'VP-4', 'VP-5', 'VP-6', 'VP-7', 'AVP-0', 'AVP-1', 'AVP-2', 'AVP-3', 'AVP-4', 'AVP-5', 'AVP-6', 'AVP-7', 'Sv', 'Sse', 'Spe', 'Sare', 'Sp', 'Si', 'Mv', 'Mse', 'Mpe', 'Mare', 'Mp', 'Mi', 'CrippenLogP', 'CrippenMR', 'SpMax_Dt', 'SpDiam_Dt', 'SpAD_Dt', 'SpMAD_Dt', 'EE_Dt', 'VE1_Dt', 'VE2_Dt', 'VE3_Dt', 'VR1_Dt', 'VR2_Dt', 'VR3_Dt', 'ECCEN', 'nHBd', 'nwHBd', 'nHBa', 'nwHBa', 'nHBint2', 'nHBint3', 'nHBint4', 'nHBint5', 'nHBint6', 'nHBint7', 'nHBint8', 'nHBint9', 'nHBint10', 'nHsOH', 'nHdNH', 'nHsSH', 'nHsNH2', 'nHssNH', 'nHaaNH', 'nHsNH3p', 'nHssNH2p', 'nHsssNHp', 'nHtCH', 'nHdCH2', 'nHdsCH', 'nHaaCH', 'nHCHnX', 'nHCsats', 'nHCsatu', 'nHAvin', 'nHother', 'nHmisc', 'nsLi', 'nssBe', 'nssssBem', 'nsBH2', 'nssBH', 'nsssB', 'nssssBm', 'nsCH3', 'ndCH2', 'nssCH2', 'ntCH', 'ndsCH', 'naaCH', 'nsssCH', 'nddC', 'ntsC', 'ndssC', 'naasC', 'naaaC', 'nssssC', 'nsNH3p', 'nsNH2', 'nssNH2p', 'ndNH', 'nssNH', 'naaNH', 'ntN', 'nsssNHp', 'ndsN', 'naaN', 'nsssN', 'nddsN', 'naasN', 'nssssNp', 'nsOH', 'ndO', 'nssO', 'naaO', 'naOm', 'nsOm', 'nsF', 'nsSiH3', 'nssSiH2', 'nsssSiH', 'nssssSi', 'nsPH2', 'nssPH', 'nsssP', 'ndsssP', 'nddsP', 'nsssssP', 'nsSH', 'ndS', 'nssS', 'naaS', 'ndssS', 'nddssS', 'nssssssS', 'nSm', 'nsCl', 'nsGeH3', 'nssGeH2', 'nsssGeH', 'nssssGe', 'nsAsH2', 'nssAsH', 'nsssAs', 'ndsssAs', 'nddsAs', 'nsssssAs', 'nsSeH', 'ndSe', 'nssSe', 'naaSe', 'ndssSe', 'nssssssSe', 'nddssSe', 'nsBr', 'nsSnH3', 'nssSnH2', 'nsssSnH', 'nssssSn', 'nsI', 'nsPbH3', 'nssPbH2', 'nsssPbH', 'nssssPb', 'SHBd', 'SwHBd', 'SHBa', 'SwHBa', 'SHBint2', 'SHBint3', 'SHBint4', 'SHBint5', 'SHBint6', 'SHBint7', 'SHBint8', 'SHBint9', 'SHBint10', 'SHsOH', 'SHdNH', 'SHsSH', 'SHsNH2', 'SHssNH', 'SHaaNH', 'SHsNH3p', 'SHssNH2p', 'SHsssNHp', 'SHtCH', 'SHdCH2', 'SHdsCH', 'SHaaCH', 'SHCHnX', 'SHCsats', 'SHCsatu', 'SHAvin', 'SHother', 'SHmisc', 'SsLi', 'SssBe', 'SssssBem', 'SsBH2', 'SssBH', 'SsssB', 'SssssBm', 'SsCH3', 'SdCH2', 'SssCH2', 'StCH', 'SdsCH', 'SaaCH', 'SsssCH', 'SddC', 'StsC', 'SdssC', 'SaasC', 'SaaaC', 'SssssC', 'SsNH3p', 'SsNH2', 'SssNH2p', 'SdNH', 'SssNH', 'SaaNH', 'StN', 'SsssNHp', 'SdsN', 'SaaN', 'SsssN', 'SddsN', 'SaasN', 'SssssNp', 'SsOH', 'SdO', 'SssO', 'SaaO', 'SaOm', 'SsOm', 'SsF', 'SsSiH3', 'SssSiH2', 'SsssSiH', 'SssssSi', 'SsPH2', 'SssPH', 'SsssP', 'SdsssP', 'SddsP', 'SsssssP', 'SsSH', 'SdS', 'SssS', 'SaaS', 'SdssS', 'SddssS', 'SssssssS', 'SSm', 'SsCl', 'SsGeH3', 'SssGeH2', 'SsssGeH', 'SssssGe', 'SsAsH2', 'SssAsH', 'SsssAs', 'SdsssAs', 'SddsAs', 'SsssssAs', 'SsSeH', 'SdSe', 'SssSe', 'SaaSe', 'SdssSe', 'SssssssSe', 'SddssSe', 'SsBr', 'SsSnH3', 'SssSnH2', 'SsssSnH', 'SssssSn', 'SsI', 'SsPbH3', 'SssPbH2', 'SsssPbH', 'SssssPb', 'minHBd', 'minwHBd', 'minHBa', 'minwHBa', 'minHBint2', 'minHBint3', 'minHBint4', 'minHBint5', 'minHBint6', 'minHBint7', 'minHBint8', 'minHBint9', 'minHBint10', 'minHsOH', 'minHdNH', 'minHsSH', 'minHsNH2', 'minHssNH', 'minHaaNH', 'minHsNH3p', 'minHssNH2p', 'minHsssNHp', 'minHtCH', 'minHdCH2', 'minHdsCH', 'minHaaCH', 'minHCHnX', 'minHCsats', 'minHCsatu', 'minHAvin', 'minHother', 'minHmisc', 'minsLi', 'minssBe', 'minssssBem', 'minsBH2', 'minssBH', 'minsssB', 'minssssBm', 'minsCH3', 'mindCH2', 'minssCH2', 'mintCH', 'mindsCH', 'minaaCH', 'minsssCH', 'minddC', 'mintsC', 'mindssC', 'minaasC', 'minaaaC', 'minssssC', 'minsNH3p', 'minsNH2', 'minssNH2p', 'mindNH', 'minssNH', 'minaaNH', 'mintN', 'minsssNHp', 'mindsN', 'minaaN', 'minsssN', 'minddsN', 'minaasN', 'minssssNp', 'minsOH', 'mindO', 'minssO', 'minaaO', 'minaOm', 'minsOm', 'minsF', 'minsSiH3', 'minssSiH2', 'minsssSiH', 'minssssSi', 'minsPH2', 'minssPH', 'minsssP', 'mindsssP', 'minddsP', 'minsssssP', 'minsSH', 'mindS', 'minssS', 'minaaS', 'mindssS', 'minddssS', 'minssssssS', 'minSm', 'minsCl', 'minsGeH3', 'minssGeH2', 'minsssGeH', 'minssssGe', 'minsAsH2', 'minssAsH', 'minsssAs', 'mindsssAs', 'minddsAs', 'minsssssAs', 'minsSeH', 'mindSe', 'minssSe', 'minaaSe', 'mindssSe', 'minssssssSe', 'minddssSe', 'minsBr', 'minsSnH3', 'minssSnH2', 'minsssSnH', 'minssssSn', 'minsI', 'minsPbH3', 'minssPbH2', 'minsssPbH', 'minssssPb', 'maxHBd', 'maxwHBd', 'maxHBa', 'maxwHBa', 'maxHBint2', 'maxHBint3', 'maxHBint4', 'maxHBint5', 'maxHBint6', 'maxHBint7', 'maxHBint8', 'maxHBint9', 'maxHBint10', 'maxHsOH', 'maxHdNH', 'maxHsSH', 'maxHsNH2', 'maxHssNH', 'maxHaaNH', 'maxHsNH3p', 'maxHssNH2p', 'maxHsssNHp', 'maxHtCH', 'maxHdCH2', 'maxHdsCH', 'maxHaaCH', 'maxHCHnX', 'maxHCsats', 'maxHCsatu', 'maxHAvin', 'maxHother', 'maxHmisc', 'maxsLi', 'maxssBe', 'maxssssBem', 'maxsBH2', 'maxssBH', 'maxsssB', 'maxssssBm', 'maxsCH3', 'maxdCH2', 'maxssCH2', 'maxtCH', 'maxdsCH', 'maxaaCH', 'maxsssCH', 'maxddC', 'maxtsC', 'maxdssC', 'maxaasC', 'maxaaaC', 'maxssssC', 'maxsNH3p', 'maxsNH2', 'maxssNH2p', 'maxdNH', 'maxssNH', 'maxaaNH', 'maxtN', 'maxsssNHp', 'maxdsN', 'maxaaN', 'maxsssN', 'maxddsN', 'maxaasN', 'maxssssNp', 'maxsOH', 'maxdO', 'maxssO', 'maxaaO', 'maxaOm', 'maxsOm', 'maxsF', 'maxsSiH3', 'maxssSiH2', 'maxsssSiH', 'maxssssSi', 'maxsPH2', 'maxssPH', 'maxsssP', 'maxdsssP', 'maxddsP', 'maxsssssP', 'maxsSH', 'maxdS', 'maxssS', 'maxaaS', 'maxdssS', 'maxddssS', 'maxssssssS', 'maxSm', 'maxsCl', 'maxsGeH3', 'maxssGeH2', 'maxsssGeH', 'maxssssGe', 'maxsAsH2', 'maxssAsH', 'maxsssAs', 'maxdsssAs', 'maxddsAs', 'maxsssssAs', 'maxsSeH', 'maxdSe', 'maxssSe', 'maxaaSe', 'maxdssSe', 'maxssssssSe', 'maxddssSe', 'maxsBr', 'maxsSnH3', 'maxssSnH2', 'maxsssSnH', 'maxssssSn', 'maxsI', 'maxsPbH3', 'maxssPbH2', 'maxsssPbH', 'maxssssPb', 'sumI', 'meanI', 'hmax', 'gmax', 'hmin', 'gmin', 'LipoaffinityIndex', 'MAXDN', 'MAXDP', 'DELS', 'MAXDN2', 'MAXDP2', 'DELS2', 'ETA_Alpha', 'ETA_AlphaP', 'ETA_dAlpha_A', 'ETA_dAlpha_B', 'ETA_Epsilon_1', 'ETA_Epsilon_2', 'ETA_Epsilon_3', 'ETA_Epsilon_4', 'ETA_Epsilon_5', 'ETA_dEpsilon_A', 'ETA_dEpsilon_B', 'ETA_dEpsilon_C', 'ETA_dEpsilon_D', 'ETA_Psi_1', 'ETA_dPsi_A', 'ETA_dPsi_B', 'ETA_Shape_P', 'ETA_Shape_Y', 'ETA_Shape_X', 'ETA_Beta', 'ETA_BetaP', 'ETA_Beta_s', 'ETA_BetaP_s', 'ETA_Beta_ns', 'ETA_BetaP_ns', 'ETA_dBeta', 'ETA_dBetaP', 'ETA_Beta_ns_d', 'ETA_BetaP_ns_d', 'ETA_Eta', 'ETA_EtaP', 'ETA_Eta_R', 'ETA_Eta_F', 'ETA_EtaP_F', 'ETA_Eta_L', 'ETA_EtaP_L', 'ETA_Eta_R_L', 'ETA_Eta_F_L', 'ETA_EtaP_F_L', 'ETA_Eta_B', 'ETA_EtaP_B', 'ETA_Eta_B_RC', 'ETA_EtaP_B_RC', 'FMF', 'fragC', 'nHBAcc', 'nHBAcc2', 'nHBAcc3', 'nHBAcc_Lipinski', 'nHBDon', 'nHBDon_Lipinski', 'HybRatio', 'IC0', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'TIC0', 'TIC1', 'TIC2', 'TIC3', 'TIC4', 'TIC5', 'SIC0', 'SIC1', 'SIC2', 'SIC3', 'SIC4', 'SIC5', 'CIC0', 'CIC1', 'CIC2', 'CIC3', 'CIC4', 'CIC5', 'BIC0', 'BIC1', 'BIC2', 'BIC3', 'BIC4', 'BIC5', 'MIC0', 'MIC1', 'MIC2', 'MIC3', 'MIC4', 'MIC5', 'ZMIC0', 'ZMIC1', 'ZMIC2', 'ZMIC3', 'ZMIC4', 'ZMIC5', 'Kier1', 'Kier2', 'Kier3', 'nAtomLC', 'nAtomP', 'nAtomLAC', 'MLogP', 'McGowan_Volume', 'MDEC-11', 'MDEC-12', 'MDEC-13', 'MDEC-14', 'MDEC-22', 'MDEC-23', 'MDEC-24', 'MDEC-33', 'MDEC-34', 'MDEC-44', 'MDEO-11', 'MDEO-12', 'MDEO-22', 'MDEN-11', 'MDEN-12', 'MDEN-13', 'MDEN-22', 'MDEN-23', 'MDEN-33', 'MLFER_A', 'MLFER_BH', 'MLFER_BO', 'MLFER_S', 'MLFER_E', 'MLFER_L', 'MPC2', 'MPC3', 'MPC4', 'MPC5', 'MPC6', 'MPC7', 'MPC8', 'MPC9', 'MPC10', 'TPC', 'piPC1', 'piPC2', 'piPC3', 'piPC4', 'piPC5', 'piPC6', 'piPC7', 'piPC8', 'piPC9', 'piPC10', 'TpiPC', 'R_TpiPCTPC', 'PetitjeanNumber', 'nRing', 'n3Ring', 'n4Ring', 'n5Ring', 'n6Ring', 'n7Ring', 'n8Ring', 'n9Ring', 'n10Ring', 'n11Ring', 'n12Ring', 'nG12Ring', 'nFRing', 'nF4Ring', 'nF5Ring', 'nF6Ring', 'nF7Ring', 'nF8Ring', 'nF9Ring', 'nF10Ring', 'nF11Ring', 'nF12Ring', 'nFG12Ring', 'nTRing', 'nT4Ring', 'nT5Ring', 'nT6Ring', 'nT7Ring', 'nT8Ring', 'nT9Ring', 'nT10Ring', 'nT11Ring', 'nT12Ring', 'nTG12Ring', 'nHeteroRing', 'n3HeteroRing', 'n4HeteroRing', 'n5HeteroRing', 'n6HeteroRing', 'n7HeteroRing', 'n8HeteroRing', 'n9HeteroRing', 'n10HeteroRing', 'n11HeteroRing', 'n12HeteroRing', 'nG12HeteroRing', 'nFHeteroRing', 'nF4HeteroRing', 'nF5HeteroRing', 'nF6HeteroRing', 'nF7HeteroRing', 'nF8HeteroRing', 'nF9HeteroRing', 'nF10HeteroRing', 'nF11HeteroRing', 'nF12HeteroRing', 'nFG12HeteroRing', 'nTHeteroRing', 'nT4HeteroRing', 'nT5HeteroRing', 'nT6HeteroRing', 'nT7HeteroRing', 'nT8HeteroRing', 'nT9HeteroRing', 'nT10HeteroRing', 'nT11HeteroRing', 'nT12HeteroRing', 'nTG12HeteroRing', 'nRotB', 'RotBFrac', 'nRotBt', 'RotBtFrac', 'LipinskiFailures', 'topoRadius', 'topoDiameter', 'topoShape', 'GGI1', 'GGI2', 'GGI3', 'GGI4', 'GGI5', 'GGI6', 'GGI7', 'GGI8', 'GGI9', 'GGI10', 'JGI1', 'JGI2', 'JGI3', 'JGI4', 'JGI5', 'JGI6', 'JGI7', 'JGI8', 'JGI9', 'JGI10', 'JGT', 'SpMax_D', 'SpDiam_D', 'SpAD_D', 'SpMAD_D', 'EE_D', 'VE1_D', 'VE2_D', 'VE3_D', 'VR1_D', 'VR2_D', 'VR3_D', 'TopoPSA', 'VABC', 'VAdjMat', 'MWC2', 'MWC3', 'MWC4', 'MWC5', 'MWC6', 'MWC7', 'MWC8', 'MWC9', 'MWC10', 'TWC', 'SRW2', 'SRW3', 'SRW4', 'SRW5', 'SRW6', 'SRW7', 'SRW8', 'SRW9', 'SRW10', 'TSRW', 'MW', 'AMW', 'WTPT-1', 'WTPT-2', 'WTPT-3', 'WTPT-4', 'WTPT-5', 'WPATH', 'WPOL', 'XLogP', 'Zagreb', 'TDB1u', 'TDB2u', 'TDB3u', 'TDB4u', 'TDB5u', 'TDB6u', 'TDB7u', 'TDB8u', 'TDB9u', 'TDB10u', 'TDB1m', 'TDB2m', 'TDB3m', 'TDB4m', 'TDB5m', 'TDB6m', 'TDB7m', 'TDB8m', 'TDB9m', 'TDB10m', 'TDB1v', 'TDB2v', 'TDB3v', 'TDB4v', 'TDB5v', 'TDB6v', 'TDB7v', 'TDB8v', 'TDB9v', 'TDB10v', 'TDB1e', 'TDB2e', 'TDB3e', 'TDB4e', 'TDB5e', 'TDB6e', 'TDB7e', 'TDB8e', 'TDB9e', 'TDB10e', 'TDB1p', 'TDB2p', 'TDB3p', 'TDB4p', 'TDB5p', 'TDB6p', 'TDB7p', 'TDB8p', 'TDB9p', 'TDB10p', 'TDB1i', 'TDB2i', 'TDB3i', 'TDB4i', 'TDB5i', 'TDB6i', 'TDB7i', 'TDB8i', 'TDB9i', 'TDB10i', 'TDB1s', 'TDB2s', 'TDB3s', 'TDB4s', 'TDB5s', 'TDB6s', 'TDB7s', 'TDB8s', 'TDB9s', 'TDB10s', 'TDB1r', 'TDB2r', 'TDB3r', 'TDB4r', 'TDB5r', 'TDB6r', 'TDB7r', 'TDB8r', 'TDB9r', 'TDB10r', 'PPSA-1', 'PPSA-2', 'PPSA-3', 'PNSA-1', 'PNSA-2', 'PNSA-3', 'DPSA-1', 'DPSA-2', 'DPSA-3', 'FPSA-1', 'FPSA-2', 'FPSA-3', 'FNSA-1', 'FNSA-2', 'FNSA-3', 'WPSA-1', 'WPSA-2', 'WPSA-3', 'WNSA-1', 'WNSA-2', 'WNSA-3', 'RPCG', 'RNCG', 'RPCS', 'RNCS', 'THSA', 'TPSA', 'RHSA', 'RPSA', 'GRAV-1', 'GRAV-2', 'GRAV-3', 'GRAVH-1', 'GRAVH-2', 'GRAVH-3', 'GRAV-4', 'GRAV-5', 'GRAV-6', 'LOBMAX', 'LOBMIN', 'MOMI-X', 'MOMI-Y', 'MOMI-Z', 'MOMI-XY', 'MOMI-XZ', 'MOMI-YZ', 'MOMI-R', 'geomRadius', 'geomDiameter', 'geomShape', 'RDF10u', 'RDF15u', 'RDF20u', 'RDF25u', 'RDF30u', 'RDF35u', 'RDF40u', 'RDF45u', 'RDF50u', 'RDF55u', 'RDF60u', 'RDF65u', 'RDF70u', 'RDF75u', 'RDF80u', 'RDF85u', 'RDF90u', 'RDF95u', 'RDF100u', 'RDF105u', 'RDF110u', 'RDF115u', 'RDF120u', 'RDF125u', 'RDF130u', 'RDF135u', 'RDF140u', 'RDF145u', 'RDF150u', 'RDF155u', 'RDF10m', 'RDF15m', 'RDF20m', 'RDF25m', 'RDF30m', 'RDF35m', 'RDF40m', 'RDF45m', 'RDF50m', 'RDF55m', 'RDF60m', 'RDF65m', 'RDF70m', 'RDF75m', 'RDF80m', 'RDF85m', 'RDF90m', 'RDF95m', 'RDF100m', 'RDF105m', 'RDF110m', 'RDF115m', 'RDF120m', 'RDF125m', 'RDF130m', 'RDF135m', 'RDF140m', 'RDF145m', 'RDF150m', 'RDF155m', 'RDF10v', 'RDF15v', 'RDF20v', 'RDF25v', 'RDF30v', 'RDF35v', 'RDF40v', 'RDF45v', 'RDF50v', 'RDF55v', 'RDF60v', 'RDF65v', 'RDF70v', 'RDF75v', 'RDF80v', 'RDF85v', 'RDF90v', 'RDF95v', 'RDF100v', 'RDF105v', 'RDF110v', 'RDF115v', 'RDF120v', 'RDF125v', 'RDF130v', 'RDF135v', 'RDF140v', 'RDF145v', 'RDF150v', 'RDF155v', 'RDF10e', 'RDF15e', 'RDF20e', 'RDF25e', 'RDF30e', 'RDF35e', 'RDF40e', 'RDF45e', 'RDF50e', 'RDF55e', 'RDF60e', 'RDF65e', 'RDF70e', 'RDF75e', 'RDF80e', 'RDF85e', 'RDF90e', 'RDF95e', 'RDF100e', 'RDF105e', 'RDF110e', 'RDF115e', 'RDF120e', 'RDF125e', 'RDF130e', 'RDF135e', 'RDF140e', 'RDF145e', 'RDF150e', 'RDF155e', 'RDF10p', 'RDF15p', 'RDF20p', 'RDF25p', 'RDF30p', 'RDF35p', 'RDF40p', 'RDF45p', 'RDF50p', 'RDF55p', 'RDF60p', 'RDF65p', 'RDF70p', 'RDF75p', 'RDF80p', 'RDF85p', 'RDF90p', 'RDF95p', 'RDF100p', 'RDF105p', 'RDF110p', 'RDF115p', 'RDF120p', 'RDF125p', 'RDF130p', 'RDF135p', 'RDF140p', 'RDF145p', 'RDF150p', 'RDF155p', 'RDF10i', 'RDF15i', 'RDF20i', 'RDF25i', 'RDF30i', 'RDF35i', 'RDF40i', 'RDF45i', 'RDF50i', 'RDF55i', 'RDF60i', 'RDF65i', 'RDF70i', 'RDF75i', 'RDF80i', 'RDF85i', 'RDF90i', 'RDF95i', 'RDF100i', 'RDF105i', 'RDF110i', 'RDF115i', 'RDF120i', 'RDF125i', 'RDF130i', 'RDF135i', 'RDF140i', 'RDF145i', 'RDF150i', 'RDF155i', 'RDF10s', 'RDF15s', 'RDF20s', 'RDF25s', 'RDF30s', 'RDF35s', 'RDF40s', 'RDF45s', 'RDF50s', 'RDF55s', 'RDF60s', 'RDF65s', 'RDF70s', 'RDF75s', 'RDF80s', 'RDF85s', 'RDF90s', 'RDF95s', 'RDF100s', 'RDF105s', 'RDF110s', 'RDF115s', 'RDF120s', 'RDF125s', 'RDF130s', 'RDF135s', 'RDF140s', 'RDF145s', 'RDF150s', 'RDF155s', 'L1u', 'L2u', 'L3u', 'P1u', 'P2u', 'E1u', 'E2u', 'E3u', 'Tu', 'Au', 'Vu', 'Ku', 'Du', 'L1m', 'L2m', 'L3m', 'P1m', 'P2m', 'E1m', 'E2m', 'E3m', 'Tm', 'Am', 'Vm', 'Km', 'Dm', 'L1v', 'L2v', 'L3v', 'P1v', 'P2v', 'E1v', 'E2v', 'E3v', 'Tv', 'Av', 'Vv', 'Kv', 'Dv', 'L1e', 'L2e', 'L3e', 'P1e', 'P2e', 'E1e', 'E2e', 'E3e', 'Te', 'Ae', 'Ve', 'Ke', 'De', 'L1p', 'L2p', 'L3p', 'P1p', 'P2p', 'E1p', 'E2p', 'E3p', 'Tp', 'Ap', 'Vp', 'Kp', 'Dp', 'L1i', 'L2i', 'L3i', 'P1i', 'P2i', 'E1i', 'E2i', 'E3i', 'Ti', 'Ai', 'Vi', 'Ki', 'Di', 'L1s', 'L2s', 'L3s', 'P1s', 'P2s', 'E1s', 'E2s', 'E3s', 'Ts', 'As', 'Vs', 'Ks', 'Ds', 'chembl_id', 'ic50_value', 'log_ic50', 'target_chembl_id', 'target_name'] 🔢 Total columns: 1881
In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split
# === Metrics ===
def medape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
mask = y_true > epsilon
return np.median(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100 if np.sum(mask) else np.nan
def smape(y_true, y_pred, epsilon=1e-3):
y_true, y_pred = np.array(y_true), np.array(y_pred)
denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
denominator = np.where(denominator < epsilon, epsilon, denominator)
return np.mean(np.abs(y_true - y_pred) / denominator) * 100
# === LSTM ===
class LSTMRegressor(nn.Module):
def __init__(self, input_size, hidden_size=64, num_layers=2, activation='relu'):
super().__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
self.activation = {'relu': nn.ReLU(), 'tanh': nn.Tanh(), 'sigmoid': nn.Sigmoid(), 'leaky_relu': nn.LeakyReLU()}.get(activation)
self.fc = nn.Linear(hidden_size, 1)
def forward(self, x):
out, _ = self.lstm(x)
out = out[:, -1, :]
return self.fc(self.activation(out)) if self.activation else self.fc(out)
def train_model(model, train_loader, val_loader, epochs=200, patience=20, lr=0.001):
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
best_loss, best_state, patience_counter = float('inf'), None, 0
for epoch in range(epochs):
model.train()
for xb, yb in train_loader:
optimizer.zero_grad()
loss = criterion(model(xb), yb)
loss.backward()
optimizer.step()
model.eval()
val_loss = np.mean([criterion(model(xb), yb).item() for xb, yb in val_loader])
if val_loss < best_loss:
best_loss, best_state = val_loss, model.state_dict()
patience_counter = 0
else:
patience_counter += 1
if patience_counter >= patience:
break
model.load_state_dict(best_state)
return model
# === Parameters ===
drop_cols = [
'chembl_id', 'ic50_value', 'log_ic50',
'target_chembl_id', 'target_name', 'Name'
]
metrics = ['RMSE', 'MSE', 'R²', 'MedAPE', 'SMAPE']
# === Prepare Training Set ===
y = padel_train['log_ic50']
# Drop unnecessary columns, clean, and impute missing values
X = (
padel_train
.drop(columns=[col for col in drop_cols if col in padel_train.columns])
.replace(r'^\s*$', np.nan, regex=True)
.apply(pd.to_numeric, errors='coerce')
.fillna(lambda df: df.mean())
)
# Feature selection based on variance
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)
print(f"Training set shape (features): {X_selected.shape}, Target shape: {y.shape}")
# === Prepare Blind Set ===
X_blind = (
padel_blind
.drop(columns=[col for col in drop_cols if col in padel_blind.columns])
.replace(r'^\s*$', np.nan, regex=True)
.apply(pd.to_numeric, errors='coerce')
.fillna(lambda df: df.mean())
)
# Apply the same feature selector
X_blind_selected = selector.transform(X_blind)
print(f"Blind set shape (features): {X_blind_selected.shape}")
# === Models ===
models = {
'Random Forest': RandomForestRegressor(**rf_best_params),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, random_state=42),
'SVM': SVR(),
'LSTM (PyTorch)': None # will instantiate per fold
}
# === Cross-validation and Blind Evaluation ===
for k in range(2, 11):
print(f"\n--- {k}-Fold Cross Validation ---")
kf = KFold(n_splits=k, shuffle=True, random_state=42)
scores = {metric: {m: [] for m in models} for metric in metrics}
for train_idx, test_idx in kf.split(X_selected):
X_train, X_test = X_selected[train_idx], X_selected[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Classic models
for name, model in models.items():
if name == 'LSTM (PyTorch)':
continue
model.fit(X_train_scaled, y_train)
y_pred = np.maximum(model.predict(X_test_scaled), 0)
scores['RMSE'][name].append(np.sqrt(mean_squared_error(y_test, y_pred)))
scores['MSE'][name].append(mean_squared_error(y_test, y_pred))
scores['R²'][name].append(r2_score(y_test, y_pred))
scores['MedAPE'][name].append(medape(y_test, y_pred))
scores['SMAPE'][name].append(smape(np.expm1(y_test), np.expm1(y_pred)))
# LSTM
X_train_seq = X_train_scaled.reshape(-1, 1, X_train_scaled.shape[1])
X_test_seq = X_test_scaled.reshape(-1, 1, X_test_scaled.shape[1])
train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32)
train_ds, val_ds = random_split(TensorDataset(train_tensor, y_train_tensor), [int(0.8*len(train_tensor)), len(train_tensor) - int(0.8*len(train_tensor))])
train_loader = DataLoader(train_ds, batch_size=lstm_best_params['batch_size'], shuffle=True)
val_loader = DataLoader(val_ds, batch_size=lstm_best_params['batch_size'])
lstm = LSTMRegressor(
input_size=train_tensor.shape[2],
hidden_size=lstm_best_params['hidden_size'],
num_layers=lstm_best_params['num_layers'],
activation=lstm_best_params['activation']
)
lstm = train_model(lstm, train_loader, val_loader, lr=lstm_best_params['learning_rate'])
lstm.eval()
with torch.no_grad():
y_pred_lstm = torch.clamp(lstm(test_tensor), min=0).squeeze().numpy()
scores['RMSE']['LSTM (PyTorch)'].append(np.sqrt(mean_squared_error(y_test, y_pred_lstm)))
scores['MSE']['LSTM (PyTorch)'].append(mean_squared_error(y_test, y_pred_lstm))
scores['R²']['LSTM (PyTorch)'].append(r2_score(y_test, y_pred_lstm))
scores['MedAPE']['LSTM (PyTorch)'].append(medape(y_test, y_pred_lstm))
scores['SMAPE']['LSTM (PyTorch)'].append(smape(np.expm1(y_test), np.expm1(y_pred_lstm)))
# === Fold-wise Average Report ===
print("\nAverage Cross-Validation Performance:")
perf_df = pd.DataFrame({
'Model': list(models.keys()),
'Avg RMSE': [np.mean(scores['RMSE'][m]) for m in models],
'Avg MSE': [np.mean(scores['MSE'][m]) for m in models],
'Avg R²': [np.mean(scores['R²'][m]) for m in models],
'Avg MedAPE (%)': [np.mean(scores['MedAPE'][m]) for m in models],
'Avg SMAPE (%)': [np.mean(scores['SMAPE'][m]) for m in models]
})
print(perf_df.to_string(index=False, float_format="%.4f"))
# === Plot ===
fig, axes = plt.subplots(1, 5, figsize=(30, 6))
for i, metric in enumerate(perf_df.columns[1:]):
sns.barplot(data=perf_df, x='Model', y=metric, ax=axes[i], palette='Set2')
axes[i].set_title(f"{metric} ({k}-Fold)")
axes[i].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# === Blind Set Evaluation ===
print(f"\n--- Evaluating Blind Set After {k}-Fold ---")
X_blind_scaled = scaler.transform(X_blind_selected)
blind_metrics = {metric: {} for metric in metrics}
for name, model in models.items():
if name == 'LSTM (PyTorch)':
X_blind_seq = X_blind_scaled.reshape(-1, 1, X_blind_scaled.shape[1])
X_blind_tensor = torch.tensor(X_blind_seq, dtype=torch.float32)
lstm.eval()
with torch.no_grad():
y_blind_pred = torch.clamp(lstm(X_blind_tensor), min=0).squeeze().numpy()
else:
y_blind_pred = np.maximum(model.predict(X_blind_scaled), 0)
blind_metrics['RMSE'][name] = np.sqrt(mean_squared_error(padel_blind['log_ic50'], y_blind_pred))
blind_metrics['MSE'][name] = mean_squared_error(padel_blind['log_ic50'], y_blind_pred)
blind_metrics['R²'][name] = r2_score(padel_blind['log_ic50'], y_blind_pred)
blind_metrics['MedAPE'][name] = medape(padel_blind['log_ic50'], y_blind_pred)
blind_metrics['SMAPE'][name] = smape(np.expm1(padel_blind['log_ic50']), np.expm1(y_blind_pred))
blind_df = pd.DataFrame({
'Model': list(models.keys()),
'RMSE': [blind_metrics['RMSE'][m] for m in models],
'MSE': [blind_metrics['MSE'][m] for m in models],
'R²': [blind_metrics['R²'][m] for m in models],
'MedAPE (%)': [blind_metrics['MedAPE'][m] for m in models],
'SMAPE (%)': [blind_metrics['SMAPE'][m] for m in models],
})
print(blind_df.to_string(index=False, float_format="%.4f"))
# === Blind Set Performance Plot ===
fig, axes = plt.subplots(1, 5, figsize=(30, 6))
for i, metric in enumerate(blind_df.columns[1:]): # Skip 'Model'
sns.barplot(data=blind_df, x='Model', y=metric, ax=axes[i], palette='Set1')
axes[i].set_title(f"{metric} (Blind Set - {k}-Fold)")
axes[i].tick_params(axis='x', rotation=45)
axes[i].set_ylabel('')
axes[i].set_xlabel('')
plt.tight_layout()
plt.show()
Training set shape (features): (2340, 1356), Target shape: (2340,)
Blind set shape (features): (586, 1356)
--- 2-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.8513 0.7251 0.7110 17.9757 83.4071
Gradient Boosting 0.8520 0.7260 0.7104 19.3290 84.5180
SVM 0.8601 0.7400 0.7050 18.5874 84.8082
LSTM (PyTorch) 0.8895 0.7913 0.6842 19.6600 85.8534
--- Evaluating Blind Set After 2-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8730 0.7621 0.7049 18.7240 84.2277
Gradient Boosting 0.8766 0.7685 0.7024 18.9597 84.2176
SVM 0.8669 0.7516 0.7090 17.6902 83.6497
LSTM (PyTorch) 0.9569 0.9157 0.6454 19.8928 87.0778
--- 3-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.8137 0.6627 0.7359 17.0504 80.5795
Gradient Boosting 0.8143 0.6640 0.7354 17.5325 82.0657
SVM 0.8250 0.6816 0.7285 18.1769 82.8730
LSTM (PyTorch) 0.8711 0.7589 0.6970 18.8863 84.4106
--- Evaluating Blind Set After 3-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8386 0.7032 0.7277 17.4101 80.2232
Gradient Boosting 0.8520 0.7260 0.7189 17.0870 81.5805
SVM 0.8256 0.6816 0.7361 16.7889 80.4963
LSTM (PyTorch) 0.8846 0.7826 0.6969 18.4272 80.5206
--- 4-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.8044 0.6479 0.7414 16.9085 80.4870
Gradient Boosting 0.8131 0.6619 0.7358 17.5495 81.8134
SVM 0.8194 0.6722 0.7317 17.9651 82.5797
LSTM (PyTorch) 0.8418 0.7090 0.7162 17.6983 82.0275
--- Evaluating Blind Set After 4-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8263 0.6828 0.7356 16.8877 78.9719
Gradient Boosting 0.8329 0.6938 0.7313 17.8203 80.7984
SVM 0.8189 0.6706 0.7403 16.3375 79.5923
LSTM (PyTorch) 0.8840 0.7815 0.6974 18.6283 81.1667
--- 5-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7994 0.6415 0.7438 16.6058 79.6129
Gradient Boosting 0.7946 0.6335 0.7467 17.4364 81.0315
SVM 0.8092 0.6571 0.7376 17.5046 82.0490
LSTM (PyTorch) 0.8338 0.6957 0.7218 17.7607 81.8287
--- Evaluating Blind Set After 5-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8195 0.6716 0.7400 16.8672 78.4379
Gradient Boosting 0.8411 0.7074 0.7261 16.9818 79.3822
SVM 0.8140 0.6626 0.7434 16.4415 79.1539
LSTM (PyTorch) 0.8905 0.7931 0.6929 18.8551 83.1659
--- 6-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7945 0.6329 0.7474 16.8265 79.5653
Gradient Boosting 0.8044 0.6492 0.7410 17.4319 81.3851
SVM 0.8056 0.6511 0.7400 17.3922 81.8520
LSTM (PyTorch) 0.8416 0.7096 0.7158 17.6455 82.7010
--- Evaluating Blind Set After 6-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8170 0.6675 0.7415 16.4331 77.6954
Gradient Boosting 0.8296 0.6882 0.7335 17.9441 81.0340
SVM 0.8087 0.6540 0.7467 16.0969 78.8238
LSTM (PyTorch) 0.8832 0.7801 0.6979 18.3897 82.6454
--- 7-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7946 0.6339 0.7460 16.3470 79.2097
Gradient Boosting 0.7998 0.6417 0.7426 16.9436 80.9946
SVM 0.8039 0.6487 0.7395 17.1662 81.6090
LSTM (PyTorch) 0.8222 0.6763 0.7278 16.9108 80.9146
--- Evaluating Blind Set After 7-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8059 0.6495 0.7485 15.8206 77.2111
Gradient Boosting 0.8274 0.6846 0.7349 18.8968 80.7818
SVM 0.8060 0.6496 0.7484 15.9965 78.9845
LSTM (PyTorch) 0.8808 0.7757 0.6996 18.2042 81.7837
--- 8-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7907 0.6272 0.7487 16.0637 78.9184
Gradient Boosting 0.7961 0.6357 0.7449 17.0644 80.3324
SVM 0.8025 0.6464 0.7403 17.4287 81.6472
LSTM (PyTorch) 0.8353 0.6988 0.7186 17.0027 82.0261
--- Evaluating Blind Set After 8-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8082 0.6532 0.7471 16.5399 78.1717
Gradient Boosting 0.8166 0.6668 0.7418 17.5269 79.4840
SVM 0.8039 0.6463 0.7497 16.2436 78.8541
LSTM (PyTorch) 0.8315 0.6915 0.7322 18.1958 81.6388
--- 9-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7966 0.6372 0.7454 16.3645 79.3389
Gradient Boosting 0.7932 0.6314 0.7472 17.3816 81.3334
SVM 0.8002 0.6427 0.7428 17.2345 81.4064
LSTM (PyTorch) 0.8407 0.7078 0.7160 17.6891 82.1518
--- Evaluating Blind Set After 9-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8096 0.6555 0.7462 16.6125 77.8286
Gradient Boosting 0.8296 0.6882 0.7335 18.2068 80.1138
SVM 0.8031 0.6450 0.7503 16.2288 78.7813
LSTM (PyTorch) 0.8428 0.7104 0.7249 17.6260 80.5237
--- 10-Fold Cross Validation ---
Average Cross-Validation Performance:
Model Avg RMSE Avg MSE Avg R² Avg MedAPE (%) Avg SMAPE (%)
Random Forest 0.7878 0.6236 0.7501 15.9826 78.6798
Gradient Boosting 0.7911 0.6291 0.7475 16.7672 80.3359
SVM 0.7968 0.6373 0.7441 17.2957 81.6198
LSTM (PyTorch) 0.8185 0.6715 0.7299 17.1095 80.9563
--- Evaluating Blind Set After 10-Fold ---
Model RMSE MSE R² MedAPE (%) SMAPE (%)
Random Forest 0.8044 0.6470 0.7494 16.6074 77.3725
Gradient Boosting 0.8180 0.6691 0.7409 18.8911 80.1414
SVM 0.7998 0.6397 0.7523 15.9673 78.4854
LSTM (PyTorch) 0.8675 0.7525 0.7086 17.7688 81.6895
In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
# ==================================
# 1. Define drop columns
# ==================================
drop_cols = ['chembl_id', 'ic50_value', 'log_ic50',
'target_chembl_id', 'target_name', 'Name']
# ==================================
# 2. Prepare training data
# ==================================
y = padel_train['log_ic50']
X = padel_train.drop(columns=[col for col in drop_cols if col in padel_train.columns])
# Clean: remove empty strings → NaN → numeric → fill NaN with column mean
X = X.replace(r'^\s*$', np.nan, regex=True).apply(pd.to_numeric, errors='coerce')
X = X.fillna(X.mean())
# ==================================
# 3. Feature selection (low variance)
# ==================================
selector = VarianceThreshold(threshold=0.01)
X_selected = pd.DataFrame(
selector.fit_transform(X),
columns=X.columns[selector.get_support()],
index=X.index
)
print(f"Original features: {X.shape[1]}, After selection: {X_selected.shape[1]}")
# ==================================
# 4. Standard scaling
# ==================================
scaler = StandardScaler()
X_scaled = pd.DataFrame(
scaler.fit_transform(X_selected),
columns=X_selected.columns,
index=X_selected.index
)
# ==================================
# 5. Random Forest with 10-Fold CV
# ==================================
kf = KFold(n_splits=10, shuffle=True, random_state=42)
r2_scores, mse_scores = [], []
for train_idx, test_idx in kf.split(X_scaled):
X_train, X_test = X_scaled.iloc[train_idx], X_scaled.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_scores.append(r2_score(y_test, y_pred))
mse_scores.append(mean_squared_error(y_test, y_pred))
print(f"Average R²: {np.mean(r2_scores):.4f}")
print(f"Average MSE: {np.mean(mse_scores):.4f}")
# ==================================
# 6. Feature importance
# ==================================
importances = model.feature_importances_
feature_importance_df = pd.DataFrame({
'Feature': X_scaled.columns,
'Importance': importances
}).sort_values(by='Importance', ascending=False)
top_n = 100
top_features = feature_importance_df.head(top_n)
# ==================================
# 7. Plot feature importance
# ==================================
plt.figure(figsize=(10, 12))
sns.barplot(x='Importance', y='Feature', data=top_features, palette='viridis')
plt.title(f"Top {top_n} Important Features (Random Forest)", fontsize=14)
plt.tight_layout()
plt.show()
Original features: 1875, After selection: 1356 Average R²: 0.7490 Average MSE: 0.6264
In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
# === Parameters ===
drop_cols = [
'chembl_id', 'ic50_value', 'log_ic50',
'target_chembl_id', 'target_name', 'Name'
]
output_plot_path = "/Users/janat/Desktop/ppimic50pred/Padel_important_features_correlation.png"
n_splits = 5 # CV folds
# === Prepare Training Set ===
y = padel_train['log_ic50']
X = (
padel_train
.drop(columns=[col for col in drop_cols if col in padel_train.columns])
.replace(r'^\s*$', np.nan, regex=True)
.apply(pd.to_numeric, errors='coerce')
.fillna(lambda df: df.mean())
)
# === Remove low-variance features ===
selector = VarianceThreshold(threshold=0.01)
X_var = selector.fit_transform(X)
selected_cols = X.columns[selector.get_support()]
print(f"Training set after variance filter: {X_var.shape}, Target shape: {y.shape}")
# === Cross-validation for Feature Importance ===
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
feature_importances = np.zeros(len(selected_cols))
for fold, (train_idx, val_idx) in enumerate(kf.split(X_var)):
X_train, y_train = X_var[train_idx], y.iloc[train_idx]
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
feature_importances += rf.feature_importances_
# Average importances across folds
feature_importances /= n_splits
importances = pd.Series(feature_importances, index=selected_cols)
# === Filter features with importance > 0.00 ===
important_features = importances[importances > 0.00].sort_values(ascending=False)
print(f"Selected {len(important_features)} features with importance > 0.00")
# === Correlation Heatmap of Important Features ===
X_imp = X[important_features.index]
corr_matrix = X_imp.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(
corr_matrix,
annot=False, cmap='coolwarm',
cbar_kws={'label': 'Correlation'}
)
plt.title("Correlation Heatmap of Important Features", fontsize=14)
plt.tight_layout()
# Save plot
plt.savefig(output_plot_path, dpi=300)
plt.close()
print(f"Correlation heatmap saved to: {output_plot_path}")
Training set after variance filter: (2340, 1356), Target shape: (2340,) Selected 1354 features with importance > 0.00 Correlation heatmap saved to: /Users/janat/Desktop/ppimic50pred/Padel_important_features_correlation.png
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: